diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip index 1b53ebec9b5821..63572bfd242b98 100644 --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -2,44 +2,55 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ // RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPTNONE %s +// RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s + #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) -//. -// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0 -// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" -// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 -//. -__device__ void extern_func(); - // OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv // OPTNONE-SAME: () #[[ATTR0:[0-9]+]] { // OPTNONE-NEXT: entry: -// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]] // OPTNONE-NEXT: ret void // +// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// OPT-LABEL: define {{[^@]+}}@_Z4funcv +// OPT-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] { +// OPT-NEXT: entry: +// OPT-NEXT: ret void +// __device__ void func() { - extern_func(); + } // OPTNONE: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone // OPTNONE-LABEL: define {{[^@]+}}@_Z6kernelv -// OPTNONE-SAME: () #[[ATTR2:[0-9]+]] { +// OPTNONE-SAME: () #[[ATTR1:[0-9]+]] { // OPTNONE-NEXT: entry: -// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3]] // OPTNONE-NEXT: ret void // +// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) 
+// OPT-LABEL: define {{[^@]+}}@_Z6kernelv +// OPT-SAME: () local_unnamed_addr #[[ATTR1:[0-9]+]] { +// OPT-NEXT: entry: +// OPT-NEXT: ret void +// __global__ void kernel() { - extern_func(); + } //. -// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } -// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind } +// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// OPTNONE: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} //. 
-// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} -// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// OPT: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: !2 = !{i32 1, !"wchar_size", i32 4} //. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index bcfb31f6f6e758..d2529e152052f3 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -135,10 +135,6 @@ Changes to the AMDGPU Backend :ref:`atomicrmw ` instruction with `fadd`, `fmin` and `fmax` with addrspace(3) instead. -* AMDGPUAttributor is no longer run as part of the codegen pass - pipeline. It is expected to run as part of the middle end - optimizations. - Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7354539d533c4f..b6766bc34e5625 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -796,14 +796,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); }); - // FIXME: Why is AMDGPUAttributor not in CGSCC? - PB.registerOptimizerLastEPCallback( - [this](ModulePassManager &MPM, OptimizationLevel Level) { - if (Level != OptimizationLevel::O0) { - MPM.addPass(AMDGPUAttributorPass(*this)); - } - }); - PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { // We want to support the -lto-partitions=N option as "best effort". 
@@ -1144,6 +1136,11 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPULowerModuleLDSLegacyPass(&TM)); } + // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run + // after their introduction + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(createAMDGPUAttributorLegacyPass()); + if (TM.getOptLevel() > CodeGenOptLevel::None) addPass(createInferAddressSpacesPass()); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 43437323188c19..96c7447695dfc0 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -865,12 +865,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, break; } } - - // FIXME: We can spill incoming arguments and restore at the end of the - // prolog. - if (!ScratchWaveOffsetReg) - report_fatal_error( - "could not find temporary scratch offset register in prolog"); } else { ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll index 359c1e53de99e3..a38b6e3263882c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_add_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 @@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_add_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] @@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: s_sub_u64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, s0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: s_sub_u64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index c701e873fdd2c5..cd1209a87031c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -2038,7 +2038,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2068,7 +2068,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2095,7 +2095,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2126,14 +2126,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2159,11 +2155,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2189,11 +2181,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2220,11 +2208,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2251,11 +2235,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; 
GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2290,7 +2270,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2318,7 +2298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2344,7 +2324,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2372,14 +2352,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; 
GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2404,11 +2380,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2433,11 +2405,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2463,11 +2431,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2493,11 +2457,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2531,7 +2491,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen @@ -2562,7 +2522,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2591,7 +2551,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: 
v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -2623,15 +2583,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2660,11 +2616,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2692,11 +2644,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: 
v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2727,11 +2675,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2762,11 +2706,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2805,7 +2745,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2835,7 +2775,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: 
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2862,7 +2802,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2892,14 +2832,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2927,11 +2863,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2957,11 +2889,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2990,11 +2918,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -3023,11 +2947,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 
s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 90110e6e0c09ec..a35c8b1e496bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -2038,7 +2038,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v3, v1, v1 @@ -2068,7 +2068,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b32_e32 v1, v0 ; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 @@ -2095,7 +2095,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v3, v1, v1 @@ -2126,14 +2126,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2159,11 +2155,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 @@ -2189,11 +2181,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2220,11 +2208,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 
-; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -2251,11 +2235,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, v0 ; GFX7-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2290,7 +2270,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_num_f32 v3, v0, v0 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2318,7 +2298,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2344,7 +2324,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_max_f32 v3, v0, v0 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2372,14 +2352,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2404,11 +2380,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2433,11 +2405,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: 
; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v3, v0, v0 @@ -2463,11 +2431,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2493,11 +2457,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 @@ -2531,7 +2491,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null 
offen @@ -2562,7 +2522,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v1 ; GFX940-NEXT: buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen @@ -2591,7 +2551,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen @@ -2623,15 +2583,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2660,11 +2616,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2692,11 +2644,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2727,11 +2675,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2762,11 +2706,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, 
s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen @@ -2805,7 +2745,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s6 +; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen @@ -2835,7 +2775,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v6, s6 +; GFX940-NEXT: v_mov_b32_e32 v6, s4 ; GFX940-NEXT: buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2862,7 +2802,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen @@ -2892,14 +2832,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s16 -; GFX10-NEXT: 
s_mov_b32 s7, s17 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2927,11 +2863,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, s6 -; GFX90A-NEXT: s_mov_b32 s5, s7 -; GFX90A-NEXT: s_mov_b32 s6, s16 -; GFX90A-NEXT: s_mov_b32 s7, s17 -; GFX90A-NEXT: v_mov_b32_e32 v6, s18 +; GFX90A-NEXT: v_mov_b32_e32 v6, s8 ; GFX90A-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] @@ -2957,11 +2889,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s4, s6 -; GFX908-NEXT: s_mov_b32 s5, s7 -; GFX908-NEXT: s_mov_b32 s6, s16 -; GFX908-NEXT: s_mov_b32 s7, s17 -; GFX908-NEXT: v_mov_b32_e32 v6, s18 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[8:9], 0 @@ -2990,11 +2918,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s6 -; GFX8-NEXT: s_mov_b32 s5, s7 -; GFX8-NEXT: s_mov_b32 s6, s16 -; 
GFX8-NEXT: s_mov_b32 s7, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[8:9], 0 @@ -3023,11 +2947,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s7 -; GFX7-NEXT: s_mov_b32 s6, s16 -; GFX7-NEXT: s_mov_b32 s7, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v6, s8 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen ; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[8:9], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index 705bcbddf227a6..b04bc04ab22691 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: 
s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 42 
:: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_dec_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: 
v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc @@ 
-364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 
@@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] 
offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define 
amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,10 +787,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -812,7 +810,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -829,7 +827,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -846,7 +844,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +855,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -869,10 +867,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -889,7 +885,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -904,7 +900,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -919,7 +915,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -934,7 +930,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ 
-950,7 +946,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -969,7 +965,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -986,7 +982,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1003,7 +999,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1018,7 +1014,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1036,7 +1032,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) 
#1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1056,7 +1052,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1073,7 +1069,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1090,7 +1086,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1105,7 +1101,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -1123,7 +1119,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: 
flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1143,7 +1139,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1155,7 +1151,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1163,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1179,7 +1175,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1193,7 +1189,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1210,7 +1206,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1224,7 +1220,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1238,7 +1234,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1250,7 +1246,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1266,7 +1262,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 @@ -1284,7 +1280,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1298,7 +1294,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1312,7 +1308,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1324,7 +1320,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -1340,7 +1336,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -1358,7 +1354,7 @@ define 
amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1380,7 +1376,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1402,7 +1398,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1422,7 +1418,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1445,14 +1441,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1476,7 +1470,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1493,7 +1487,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1510,7 +1504,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1525,7 +1519,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1544,14 +1538,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20 @@ -1570,7 +1562,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1591,7 +1583,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1612,7 +1604,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1628,7 +1620,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1637,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1665,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1688,7 +1680,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1711,7 +1703,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1727,7 +1719,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1738,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -1767,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1780,7 +1772,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1785,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1798,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; 
GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1821,7 +1813,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1839,7 +1831,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,7 +1846,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1869,7 +1861,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1882,7 +1874,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1899,7 +1891,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1918,7 +1910,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,7 +1925,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1948,7 +1940,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1961,7 +1953,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1978,7 +1970,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1997,7 +1989,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2023,7 +2015,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2049,7 +2041,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2070,7 +2062,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 
@@ -2094,15 +2086,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2126,7 +2117,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2144,7 +2135,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2162,7 +2153,7 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2178,7 +2169,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2198,15 +2189,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2224,7 +2214,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) 
%add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2242,7 +2232,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2260,7 +2250,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2276,7 +2266,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2289,10 +2279,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8 @@ -2317,8 +2305,8 @@ define amdgpu_kernel void 
@atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2333,8 +2321,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2349,8 +2337,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2363,12 +2351,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2379,12 +2367,12 @@ define amdgpu_kernel void 
@lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2401,8 +2389,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2417,8 +2405,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2433,8 +2421,8 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2447,12 +2435,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2463,12 +2451,12 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -2486,7 +2474,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2498,7 +2486,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 
0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2510,7 +2498,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2521,7 +2509,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2533,7 +2521,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2548,7 +2536,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -2560,7 +2548,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -2572,7 +2560,7 @@ 
define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2583,7 +2571,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2595,7 +2583,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -2611,7 +2599,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2627,7 +2615,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2643,7 +2631,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: 
global_atomic_dec_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2656,7 +2644,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_dec_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2670,7 +2658,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_dec_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2690,7 +2678,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2708,7 +2696,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2726,7 +2714,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2739,7 +2727,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2753,7 +2741,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2774,7 +2762,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2792,7 +2780,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2798,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2823,7 +2811,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2837,7 +2825,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2858,7 +2846,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2871,7 +2859,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2884,7 +2872,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_dec_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2896,7 +2884,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_dec_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2909,7 +2897,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_dec_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2925,7 +2913,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2940,7 +2928,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2955,7 +2943,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2967,7 +2955,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2980,7 +2968,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2997,7 +2985,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3012,7 +3000,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,7 +3015,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 @@ -3039,7 +3027,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -3052,7 +3040,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3069,7 +3057,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3092,7 +3080,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3115,7 +3103,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3128,7 +3116,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -3142,17 +3130,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3168,7 +3154,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3186,7 +3172,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3204,7 +3190,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3216,7 +3202,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -3229,13 +3215,11 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3250,7 +3234,7 @@ define amdgpu_kernel void 
@global_atomic_dec_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -3269,7 +3253,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -3289,7 +3273,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3306,7 +3290,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -3319,21 +3303,18 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 9 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX11-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index b3a7e65f771c43..f6a997fb0fb01b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -16,8 +16,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -31,8 +31,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: 
v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,8 +46,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -59,11 +59,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -74,11 +74,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -95,8 +95,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i32_offset: 
; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -110,8 +110,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; VI-LABEL: lds_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,8 +125,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -138,11 +138,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -153,11 +153,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -197,7 +197,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -218,7 +218,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i32: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_inc_u32 v0, v1 @@ -232,7 +232,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -254,7 +254,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -264,7 +264,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -305,7 +305,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -320,7 +320,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -332,7 +332,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -345,7 +345,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_atomic_inc_u32 v0, v1, v0, s[2:3] glc @@ -364,7 +364,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -381,7 +381,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -410,7 +410,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 
v0, v1, v0, s[2:3] offset:16 glc @@ -443,7 +443,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -460,7 +460,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -489,7 +489,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -502,7 +502,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -569,7 +569,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] @@ -584,7 +584,7 @@ 
define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -598,7 +598,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -612,7 +612,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -635,7 +635,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -651,7 +651,7 @@ define amdgpu_kernel void 
@global_atomic_inc_noret_i32_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -665,7 +665,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -679,7 +679,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -690,7 +690,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -702,7 +702,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 @@ -718,7 +718,7 @@ define 
amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -740,7 +740,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -762,7 +762,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -774,7 +774,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -787,10 +787,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -812,7 +810,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -829,7 +827,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -846,7 +844,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -857,7 +855,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -869,10 +867,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -889,7 +885,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; CI-NEXT: v_mov_b32_e32 v2, 9 ; CI-NEXT: s_mov_b32 m0, -1 @@ -907,7 +903,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 9 ; VI-NEXT: s_mov_b32 m0, -1 @@ -925,7 +921,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +937,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 9 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -954,10 +950,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 @@ -982,8 +976,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -998,8 +992,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: lds_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1014,8 +1008,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: lds_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1028,12 +1022,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: lds_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,12 +1038,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: lds_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1066,8 +1060,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1082,8 +1076,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; 
VI-LABEL: lds_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1098,8 +1092,8 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1112,12 +1106,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1128,12 +1122,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; ; GFX11-LABEL: lds_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: 
ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -1151,7 +1145,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1163,7 +1157,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; VI-LABEL: lds_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1175,7 +1169,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX9-LABEL: lds_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1186,7 +1180,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX10-LABEL: lds_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,7 +1192,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; ; GFX11-LABEL: lds_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1213,7 +1207,7 @@ define amdgpu_kernel void 
@lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 { ; CI-LABEL: lds_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x0 +; CI-NEXT: s_load_dword s0, s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1225,7 +1219,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; VI-LABEL: lds_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x0 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 @@ -1237,7 +1231,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1248,7 +1242,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX10-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,7 +1254,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; ; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 @@ -1276,7 +1270,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1292,7 +1286,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1308,7 +1302,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: global_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1321,7 +1315,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: global_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1335,7 +1329,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,7 +1349,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1373,7 +1367,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1391,7 +1385,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1404,7 +1398,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1418,7 +1412,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1439,7 +1433,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,7 +1451,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1475,7 +1469,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1488,7 +1482,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1502,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1523,7 +1517,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1536,7 +1530,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1549,7 +1543,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX9-LABEL: global_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1561,7 +1555,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX10-LABEL: global_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1574,7 +1568,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; ; GFX11-LABEL: global_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1590,7 +1584,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: 
v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1605,7 +1599,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,7 +1614,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1632,7 +1626,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1645,7 +1639,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1662,7 +1656,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1677,7 +1671,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1692,7 +1686,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1704,7 +1698,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1717,7 +1711,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,7 +1728,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1757,7 +1751,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1780,7 +1774,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1793,7 +1787,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 @@ -1807,17 +1801,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 glc +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv 
; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1833,7 +1825,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1851,7 +1843,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1869,7 +1861,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1881,7 +1873,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1894,13 +1886,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; ; GFX11-LABEL: 
global_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 +; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1915,7 +1905,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1930,7 +1920,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1945,7 +1935,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1960,7 +1950,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -1976,7 +1966,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1995,7 +1985,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2012,7 +2002,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2029,7 +2019,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2044,7 +2034,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2062,7 +2052,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2082,7 +2072,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2099,7 +2089,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2116,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2131,7 +2121,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: 
v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 @@ -2149,7 +2139,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2169,7 +2159,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2181,7 +2171,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2193,7 +2183,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2205,7 +2195,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2219,7 +2209,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2236,7 +2226,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2250,7 +2240,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2264,7 +2254,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2276,7 +2266,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2292,7 +2282,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2310,7 +2300,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2324,7 +2314,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2338,7 +2328,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2350,7 +2340,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 @@ -2366,7 +2356,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -2384,7 +2374,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2406,7 +2396,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2428,7 +2418,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,7 +2438,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,14 +2461,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x0 -; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2502,7 +2490,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2519,7 +2507,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2536,7 +2524,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2551,7 +2539,7 @@ define 
amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -2570,14 +2558,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 @@ -2596,7 +2582,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -2615,7 +2601,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: 
v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -2635,7 +2621,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2652,7 +2638,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -2665,21 +2651,18 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; ; GFX11-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 9 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: 
global_store_b64 v3, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2695,7 +2678,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2716,7 +2699,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2737,7 +2720,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2753,7 +2736,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2753,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_ret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ 
-2790,7 +2773,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2813,7 +2796,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2836,7 +2819,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2852,7 +2835,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2871,7 +2854,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2892,7 +2875,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr 
%out, ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2915,7 +2898,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2938,7 +2921,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2954,7 +2937,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2973,7 +2956,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 @@ -2994,7 +2977,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % define 
amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3007,7 +2990,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3020,7 +3003,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3033,7 +3016,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3048,7 +3031,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3066,7 +3049,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3081,7 +3064,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3096,7 +3079,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3109,7 +3092,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3126,7 +3109,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3145,7 +3128,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,7 
+3143,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3175,7 +3158,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,7 +3171,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3205,7 +3188,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3224,7 +3207,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3250,7 +3233,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3276,7 +3259,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3297,7 +3280,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -3321,15 +3304,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3353,7 +3335,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -3371,7 +3353,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3389,7 +3371,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3405,7 +3387,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -3425,15 +3407,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -3451,12 +3432,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 { ; CI-LABEL: nocse_lds_atomic_inc_ret_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dword s6, s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v1, s6 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3471,12 +3452,12 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 @@ -3491,11 +3472,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 @@ -3507,11 +3488,11 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3526,10 +3507,10 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; ; GFX11-LABEL: nocse_lds_atomic_inc_ret_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index c45bccd184c12f..bb5ccc3657dc4d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -66,7 +66,7 @@ define amdgpu_ps i32 @select_sgpr_trunc_and_cond(i32 inreg %a.0, i32 inreg %a.1, define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE64-LABEL: sgpr_trunc_brcond: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE64-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; ; WAVE32-LABEL: sgpr_trunc_brcond: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dword s0, s[2:3], 0x24 +; WAVE32-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 @@ -113,7 +113,7 @@ bb1: define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE64-LABEL: brcond_sgpr_trunc_and: ; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_and_b32 s0, s0, s1 ; WAVE64-NEXT: s_xor_b32 s0, s0, 1 @@ -131,7 +131,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; ; WAVE32-LABEL: brcond_sgpr_trunc_and: ; WAVE32: ; %bb.0: ; %entry -; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_and_b32 s0, s0, s1 ; WAVE32-NEXT: s_xor_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 51ef21c2decb7c..d8e1c9622c9476 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -12,9 +12,9 @@ declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) define amdgpu_kernel void @kernel_caller_stack() { ; MUBUF-LABEL: kernel_caller_stack: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_mov_b32 s32, 0 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 @@ -34,8 +34,8 @@ define amdgpu_kernel void @kernel_caller_stack() { ; FLATSCR-LABEL: kernel_caller_stack: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_mov_b32 s32, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -60,9 +60,9 @@ define amdgpu_kernel void @kernel_caller_stack() { define amdgpu_kernel void @kernel_caller_byval() { ; MUBUF-LABEL: kernel_caller_byval: ; MUBUF: ; %bb.0: -; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -155,9 +155,9 @@ define amdgpu_kernel void @kernel_caller_byval() { ; ; FLATSCR-LABEL: kernel_caller_byval: ; FLATSCR: ; %bb.0: -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 405b1e8f3a250f..eb20178f9f4d88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,7 +452,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -468,7 +468,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -513,7 +513,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -539,7 +539,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -589,7 +589,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -644,7 +644,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: 
load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -679,7 +679,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -725,14 +725,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -769,17 +769,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -821,7 +821,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -858,7 +858,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -949,7 +949,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -986,7 +986,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void 
@i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1005,7 +1005,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1096,7 
+1096,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1204,7 +1204,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1221,7 +1221,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1247,7 
+1247,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1310,7 +1310,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1337,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1354,7 +1354,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 5515de0cd2fee1..78d908455e019b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -193,7 +193,7 @@ bb12: define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-LABEL: break_loop: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index c510c5972cf21d..6463db49b78e77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -25,8 +25,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -42,7 +42,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -143,8 +143,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -160,8 +160,8 @@ define amdgpu_kernel void 
@kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -177,7 +177,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s32, 16 ; GFX11-NEXT: s_mov_b32 s33, 0 @@ -278,8 +278,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -296,8 +296,8 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_movk_i32 s32, 0x400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,7 +314,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { ; ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: 
s_mov_b32 s33, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 34efb089b72bf1..1e1c90d142a1f3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3037,21 +3037,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: 
workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 @@ -3085,8 +3085,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 ; GPRIDX-NEXT: s_mov_b32 s4, 0 ; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, 0 @@ -3128,21 +3128,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3158,7 +3158,7 @@ define amdgpu_kernel void 
@dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 @@ -3176,8 +3176,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, 0 @@ -3209,7 +3209,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -3220,21 +3220,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; 
GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3250,9 +3250,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -3269,21 +3269,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] 
-; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_mov_b32 s5, 0x40140000 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10-NEXT: s_cmp_eq_u32 s6, 4 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -3312,21 +3312,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: 
kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ -3361,8 +3361,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -4054,21 +4054,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4084,9 +4084,9 @@ define amdgpu_kernel void 
@dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 12 +; GPRIDX-NEXT: wavefront_sgpr_count = 10 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4102,8 +4102,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s2, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s2, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1 @@ -4138,21 +4138,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 
1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4168,9 +4168,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 8 +; MOVREL-NEXT: wavefront_sgpr_count = 6 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4186,8 +4186,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s2, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s2, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4223,21 +4223,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: 
enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4253,9 +4253,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 8 +; GFX10-NEXT: wavefront_sgpr_count = 6 ; GFX10-NEXT: workitem_vgpr_count = 2 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4272,8 +4272,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 @@ -4308,21 +4308,21 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; 
GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4338,9 +4338,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 -; GFX11-NEXT: wavefront_sgpr_count = 5 +; GFX11-NEXT: wavefront_sgpr_count = 4 ; GFX11-NEXT: workitem_vgpr_count = 2 ; GFX11-NEXT: reserved_vgpr_first = 0 ; GFX11-NEXT: reserved_vgpr_count = 0 @@ -4357,16 +4357,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 1 -; GFX11-NEXT: s_cselect_b32 s2, 2.0, 1.0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 2 -; GFX11-NEXT: s_cselect_b32 s2, 0x40400000, s2 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s2, 4.0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s3, 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 2 +; GFX11-NEXT: s_cselect_b32 s3, 0x40400000, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, 3 +; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -4401,21 +4401,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 10 +; GPRIDX-NEXT: user_sgpr_count = 6 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 -; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 -; GPRIDX-NEXT: enable_vgpr_workitem_id = 2 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 ; GPRIDX-NEXT: enable_exception_msb = 0 ; GPRIDX-NEXT: granulated_lds_size = 0 ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 ; GPRIDX-NEXT: 
enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4431,9 +4431,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 13 +; GPRIDX-NEXT: wavefront_sgpr_count = 11 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4449,17 +4449,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 ; GPRIDX-NEXT: .end_amd_kernel_code_t ; GPRIDX-NEXT: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_load_dword s8, s[6:7], 0x8 -; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 +; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GPRIDX-NEXT: s_mov_b32 s2, 0 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 3 ; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 @@ -4477,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: 
granulated_wavefront_sgpr_count = 0 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4488,21 +4488,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 10 +; MOVREL-NEXT: user_sgpr_count = 6 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 -; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 -; MOVREL-NEXT: enable_vgpr_workitem_id = 2 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 ; MOVREL-NEXT: enable_exception_msb = 0 ; MOVREL-NEXT: granulated_lds_size = 0 ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 ; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_id = 0 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4518,9 +4518,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 9 +; MOVREL-NEXT: wavefront_sgpr_count = 7 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; 
MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4536,16 +4536,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: runtime_loader_kernel_symbol = 0 ; MOVREL-NEXT: .end_amd_kernel_code_t ; MOVREL-NEXT: ; %bb.0: ; %entry -; MOVREL-NEXT: s_load_dword s8, s[6:7], 0x8 -; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; MOVREL-NEXT: s_load_dword s6, s[4:5], 0x8 +; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: kernel_code_entry_byte_offset = 256 ; GFX10-NEXT: kernel_code_prefetch_byte_size = 0 ; GFX10-NEXT: granulated_workitem_vgpr_count = 0 -; GFX10-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX10-NEXT: granulated_wavefront_sgpr_count = 0 ; GFX10-NEXT: priority = 0 ; GFX10-NEXT: float_mode = 240 ; GFX10-NEXT: priv = 0 @@ -4576,21 +4576,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 10 +; GFX10-NEXT: user_sgpr_count = 6 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX10-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX10-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX10-NEXT: enable_sgpr_workgroup_id_z 
= 0 ; GFX10-NEXT: enable_sgpr_workgroup_info = 0 -; GFX10-NEXT: enable_vgpr_workitem_id = 2 +; GFX10-NEXT: enable_vgpr_workitem_id = 0 ; GFX10-NEXT: enable_exception_msb = 0 ; GFX10-NEXT: granulated_lds_size = 0 ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 -; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX10-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX10-NEXT: enable_sgpr_queue_ptr = 0 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX10-NEXT: enable_sgpr_dispatch_id = 1 +; GFX10-NEXT: enable_sgpr_dispatch_id = 0 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4606,9 +4606,9 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 -; GFX10-NEXT: wavefront_sgpr_count = 9 +; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 ; GFX10-NEXT: reserved_vgpr_first = 0 ; GFX10-NEXT: reserved_vgpr_count = 0 @@ -4625,17 +4625,17 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: .end_amd_kernel_code_t ; GFX10-NEXT: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s8, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10-NEXT: s_cmp_eq_u32 s6, 
2 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -4664,21 +4664,21 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_mem_ordered = 1 ; GFX11-NEXT: enable_fwd_progress = 0 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX11-NEXT: user_sgpr_count = 13 +; GFX11-NEXT: user_sgpr_count = 15 ; GFX11-NEXT: enable_trap_handler = 0 ; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_y = 1 -; GFX11-NEXT: enable_sgpr_workgroup_id_z = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 ; GFX11-NEXT: enable_sgpr_workgroup_info = 0 -; GFX11-NEXT: enable_vgpr_workitem_id = 2 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 ; GFX11-NEXT: enable_exception_msb = 0 ; GFX11-NEXT: granulated_lds_size = 0 ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 -; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 ; GFX11-NEXT: enable_sgpr_queue_ptr = 0 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GFX11-NEXT: enable_sgpr_dispatch_id = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 ; GFX11-NEXT: enable_sgpr_private_segment_size = 0 ; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 @@ -4694,7 +4694,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: workitem_private_segment_byte_size = 0 ; GFX11-NEXT: workgroup_group_segment_byte_size = 0 ; GFX11-NEXT: gds_segment_byte_size = 0 -; GFX11-NEXT: kernarg_segment_byte_size = 28 +; GFX11-NEXT: kernarg_segment_byte_size = 12 ; GFX11-NEXT: workgroup_fbarrier_count = 0 ; GFX11-NEXT: wavefront_sgpr_count = 7 ; GFX11-NEXT: workitem_vgpr_count = 3 @@ 
-4713,8 +4713,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: .end_amd_kernel_code_t ; GFX11-NEXT: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40080000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 9b9249b62b0bca..7cd99fcfd5e740 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -1,8 +1,6 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -mcpu=gfx900 | FileCheck -check-prefixes=GCN,RW-FLAT %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx940 < %s | llc | FileCheck -check-prefixes=GCN,RO-FLAT %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s ; Make sure flat_scratch_init is set diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a5e4151bf36958..63e7339d829e1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -8,9 +8,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: 
s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -26,11 +26,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -46,7 +46,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -62,7 +62,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b32 s1, s0, 2 @@ -105,10 +105,10 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -121,10 +121,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -139,7 +139,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -153,12 +152,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0, v1 @@ -168,10 +165,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -325,9 +321,9 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -345,11 +341,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -367,7 +363,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -385,7 +381,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -402,7 +398,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -434,8 +430,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -453,10 +449,10 @@ define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -475,7 +471,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -489,14 +484,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 @@ -506,13 +498,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS @@ -639,9 +630,9 @@ bb: define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -659,11 +650,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v0, 15 @@ -681,7 +672,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -700,7 +691,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -718,7 +709,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -750,8 +741,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -769,10 +760,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), 
s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -791,7 +782,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 15 @@ -806,15 +796,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 @@ -824,13 +811,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS @@ -959,8 +945,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -976,10 +962,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80 @@ -1128,9 +1114,9 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 
flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1143,11 +1129,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1160,8 +1146,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 @@ -1174,11 +1159,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ 
-1188,10 +1173,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 2d3b6ee3e9823a..632dbd45279fbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -7,16 +7,14 @@ declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x h ; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. 
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) -declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) -declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -27,7 +25,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -43,7 +41,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -84,10 +82,10 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void 
@flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -109,10 +107,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm @@ -134,12 +132,12 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] ; GFX940-NEXT: s_endpgm %ret = call <2 x i16> 
@llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void @@ -156,56 +154,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> ret <2 x i16> %ret } -define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret void -} - -define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { -; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) - ret <2 x half> %ret -} - -define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: ds_pk_add_f16 v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_endpgm - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret void -} - -define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { -; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) - ret <2 x i16> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 453b229bf62bd9..66b22bedaf0721 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -20,27 +20,26 @@ declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -72,12 +71,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -87,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: 
s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -108,22 +107,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -155,12 +154,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, 
double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -170,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -191,22 +190,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -238,12 +237,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -253,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -274,22 +273,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -321,12 +320,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -336,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -357,22 +356,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -404,12 +403,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double 
%data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -419,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -440,22 +439,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -487,12 +486,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -502,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -523,22 +522,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -570,12 +569,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -585,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -606,22 +605,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -653,12 +652,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 
%vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -668,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -689,22 +688,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -736,12 +735,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], 
s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -751,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -772,22 +771,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -819,12 +818,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -834,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -855,22 +854,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -902,12 +901,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; 
GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -917,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -938,22 +937,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -985,12 +984,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1000,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1021,7 +1020,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1030,7 +1029,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1044,7 +1043,7 @@ main_body: define amdgpu_kernel void 
@global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1053,7 +1052,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1067,7 +1066,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1076,7 +1075,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] @@ -1090,16 +1089,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, 
v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1126,22 +1125,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1154,21 +1153,21 @@ main_body: define 
amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1176,22 +1175,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1204,16 +1203,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1240,22 +1239,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 
v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1268,21 +1267,21 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1290,22 +1289,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1480,16 +1479,16 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: s_mov_b32 s0, s5 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s0, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 @@ -1514,22 +1513,22 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1542,7 +1541,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: 
v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1566,7 +1565,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1583,7 +1582,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,7 +1594,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1612,7 +1611,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1637,7 +1636,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1761,7 +1760,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1770,7 +1769,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1803,7 +1802,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] @@ -1825,7 +1824,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -1842,7 +1841,7 @@ main_body: define 
amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1851,7 +1850,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1884,7 +1883,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -1893,7 +1892,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -1923,119 +1922,47 @@ main_body: ret double %ret } -define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; 
GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: ds_add_f64 v2, v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: local_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret void -} - -define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_cbranch_execz .LBB63_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB65_2: +; GFX90A-NEXT: .LBB63_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; 
GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_cbranch_execz .LBB63_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB65_2: +; GFX940-NEXT: .LBB63_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2045,91 +1972,91 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB66_2 +; GFX90A-NEXT: s_cbranch_execz .LBB64_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB66_2: +; GFX90A-NEXT: .LBB64_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB66_2 +; GFX940-NEXT: s_cbranch_execz .LBB64_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB66_2: +; GFX940-NEXT: .LBB64_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void } -define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { +define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #2 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s4, s1 -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX90A-NEXT: 
v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_2 +; GFX90A-NEXT: s_cbranch_execz .LBB65_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB67_2: +; GFX90A-NEXT: .LBB65_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s4, s1 -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_2 +; GFX940-NEXT: s_cbranch_execz .LBB65_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB67_2: +; GFX940-NEXT: .LBB65_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 
4.0 seq_cst @@ -2158,54 +2085,6 @@ main_body: ret double %ret } -define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - -define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, v2 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" 
"amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } -attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } -attributes #3 = { "denormal-fp-math"="ieee,ieee" } -attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index e051cc28469fae..05cdb54f5dd747 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -37,8 +37,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -67,8 +67,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -87,8 +87,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -113,8 +113,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2 @@ -133,8 +133,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8 @@ -159,8 +159,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -188,8 +188,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -225,8 +225,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -243,8 +243,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -269,8 +269,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 { ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -287,8 +287,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -313,8 +313,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -341,8 +341,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -433,8 +433,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -458,8 +458,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[6:7], 0x0 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 @@ -545,8 +545,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 @@ -588,8 +588,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -682,8 +682,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -747,8 +747,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 @@ -792,8 +792,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 @@ -845,8 +845,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -922,8 +922,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 @@ -1007,8 +1007,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 @@ -1050,8 +1050,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index fe2e7afb7048ed..388ef2497e4356 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -952,9 +952,9 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p1) :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: %12:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) + ; CHECK-NEXT: %5:_(p5) = nuw nusw G_PTR_ADD [[COPY]], [[C]](s32) ; CHECK-NEXT: G_STORE [[LOAD]](s8), [[COPY]](p5) :: (store (s8) into %ir.arg0, addrspace 5) - ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %12(p5) :: (store (s32) into %ir.gep1, addrspace 5) + ; CHECK-NEXT: G_STORE [[LOAD1]](s32), %5(p5) :: (store (s32) into %ir.gep1, addrspace 5) ; CHECK-NEXT: SI_RETURN %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 9443b39dcdc033..8859ac69923a99 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 @@ -83,7 +83,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 
s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 @@ -114,9 +114,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -127,9 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -152,7 +152,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -170,9 +170,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void 
@llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 @@ -183,9 +183,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 @@ -196,7 +196,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 @@ -226,12 +226,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll index 696cbdb75f1ed9..136c51d775b43c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm-mismatched-size.ll @@ -19,9 +19,6 @@ define amdgpu_kernel void @return_type_is_too_big_vector() { ; CHECK-LABEL: name: return_type_is_too_big_vector ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (%ir-block.0): ; CHECK-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index db944b98a30135..a1c99f5cf60297 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[20:23], s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s13 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 5185f6c4ada5ba..3abc21f812e145 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0xf @@ -101,9 +101,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addr ; ; GFX11-LABEL: v_insert_v64i32_37: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0xf diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll index e67ada74c23e65..e9292f4e34dcda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel-system-sgprs.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=amdgpu-attributor < %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji 
-amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -o - | FileCheck -check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA %s ; HSA-LABEL: name: default_kernel ; HSA: liveins: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 652d22ac1224fc..f2fe815a71202c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -5,9 +5,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; HSA-VI-LABEL: name: i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -20,9 +20,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; LEGACY-MESA-VI-LABEL: name: i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 
4) @@ -40,9 +40,9 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i8_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -55,9 +55,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -75,9 +75,9 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; HSA-VI-LABEL: name: i8_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 
; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -90,9 +90,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -110,9 +110,9 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; HSA-VI-LABEL: name: i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -125,9 +125,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = 
COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -145,9 +145,9 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; HSA-VI-LABEL: name: i16_zext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -160,9 +160,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -180,9 +180,9 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) 
nocapture %out, i16 signext %in) nounwind { ; HSA-VI-LABEL: name: i16_sext_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -195,9 +195,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -215,9 +215,9 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; HSA-VI-LABEL: name: i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 
(p1), align 16, addrspace 4) @@ -229,9 +229,9 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; LEGACY-MESA-VI-LABEL: name: i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -248,9 +248,9 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; HSA-VI-LABEL: name: f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -262,9 +262,9 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; LEGACY-MESA-VI-LABEL: name: f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: 
[[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -281,9 +281,9 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; HSA-VI-LABEL: name: v2i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -295,9 +295,9 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -314,9 +314,9 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; HSA-VI-LABEL: name: v2i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = 
G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -328,9 +328,9 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v2i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -347,9 +347,9 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v2i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -361,9 +361,9 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v2i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -380,9 +380,9 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; HSA-VI-LABEL: name: v2f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -394,9 +394,9 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; LEGACY-MESA-VI-LABEL: name: v2f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -413,9 +413,9 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; HSA-VI-LABEL: name: v3i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -427,9 +427,9 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; LEGACY-MESA-VI-LABEL: name: v3i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -446,9 +446,9 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; HSA-VI-LABEL: name: v3i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -460,9 +460,9 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; LEGACY-MESA-VI-LABEL: name: v3i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; 
LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v3i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -493,9 +493,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v3i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -512,9 +512,9 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; HSA-VI-LABEL: name: v3f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -526,9 +526,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; LEGACY-MESA-VI-LABEL: name: v3f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -545,9 +545,9 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; HSA-VI-LABEL: name: v4i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -559,9 +559,9 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -578,9 +578,9 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; HSA-VI-LABEL: name: v4i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -592,9 +592,9 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v4i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -611,9 +611,9 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v4i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -625,9 +625,9 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v4i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -644,9 +644,9 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; HSA-VI-LABEL: name: v4f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -658,9 +658,9 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; LEGACY-MESA-VI-LABEL: name: v4f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; 
LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -677,9 +677,9 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; HSA-VI-LABEL: name: v8i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -691,9 +691,9 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -710,9 +710,9 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; HSA-VI-LABEL: name: v8i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -724,9 +724,9 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v8i16_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -743,9 +743,9 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v8i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -757,9 +757,9 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; LEGACY-MESA-VI-LABEL: name: v8i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -776,9 +776,9 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; HSA-VI-LABEL: name: v8f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -790,9 +790,9 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; LEGACY-MESA-VI-LABEL: name: v8f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -809,9 +809,9 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; HSA-VI-LABEL: name: 
v16i8_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -823,9 +823,9 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i8_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -842,9 +842,9 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; HSA-VI-LABEL: name: v16i16_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -856,9 +856,9 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; LEGACY-MESA-VI-LABEL: name: v16i16_arg ; 
LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -875,9 +875,9 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; HSA-VI-LABEL: name: v16i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -889,9 +889,9 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; LEGACY-MESA-VI-LABEL: name: v16i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -908,9 +908,9 @@ entry: define amdgpu_kernel void 
@v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; HSA-VI-LABEL: name: v16f32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -922,9 +922,9 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; LEGACY-MESA-VI-LABEL: name: v16f32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -941,9 +941,9 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; HSA-VI-LABEL: name: kernel_arg_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -955,9 +955,9 @@ define 
amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -973,9 +973,9 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; HSA-VI-LABEL: name: f64_kernel_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -987,9 +987,9 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; 
LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1006,9 +1006,9 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1020,9 +1020,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: i1_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1038,9 +1038,9 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1053,9 +1053,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1073,9 +1073,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_zext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1088,9 +1088,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; 
LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1108,9 +1108,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i32 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1123,9 +1123,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel 
void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-LABEL: name: i1_arg_sext_i64 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1158,9 +1158,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1180,9 +1180,9 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: 
(dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1192,9 +1192,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; HSA-VI-LABEL: name: empty_array_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1220,9 +1220,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: empty_array_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1244,9 +1244,9 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, {i32, i64} %arg1) { ; HSA-VI-LABEL: name: struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, ; ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1312,9 +1312,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr addrspace(1)} %arg0, i8 %pad, {ptr addrspace(3), ptr addrspace(1234)} %arg1) { ; HSA-VI-LABEL: name: 
pointer_in_struct_argument ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 16, addrspace 4) @@ -1340,9 +1340,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add ; ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1382,9 +1382,9 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; HSA-VI-LABEL: name: packed_struct_argument_alignment ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable 
invariant load (s32), align 16, addrspace 4) @@ -1406,9 +1406,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) @@ -1441,16 +1441,16 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %unused, i32 %in) nounwind { ; HSA-VI-LABEL: name: unused_i32_arg ; HSA-VI: bb.1.entry: - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: S_ENDPGM 0 ; ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg ; LEGACY-MESA-VI: bb.1.entry: - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: S_ENDPGM 0 entry: ret void @@ -1460,9 +1460,9 @@ entry: define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; 
HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1475,9 +1475,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1496,9 +1496,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) align 2 %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i16_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1511,9 +1511,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr 
addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1532,9 +1532,9 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align 4 %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1550,9 +1550,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; 
LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1574,9 +1574,9 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_constant_v4i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1592,9 +1592,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1616,9 +1616,9 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) 
%in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_align_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1634,9 +1634,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1658,9 +1658,9 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; HSA-VI: bb.1 (%ir-block.1): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1676,9 +1676,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.1): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1701,9 +1701,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_global_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1716,9 +1716,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1736,9 +1736,9 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_flat_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1751,9 +1751,9 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1771,9 +1771,9 @@ define amdgpu_kernel 
void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1786,9 +1786,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1806,9 +1806,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1821,9 +1821,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % ; ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1842,9 +1842,9 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_local_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1857,9 +1857,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ; ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg ; LEGACY-MESA-VI: bb.1 
(%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1877,9 +1877,9 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(4) %in0.byref, ptr addrspace(4) byref(i32) align(4) %in1.byref, i32 %after.offset) { ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 16, addrspace 4) @@ -1899,9 +1899,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p1), align 4, addrspace 4) @@ -1929,9 +1929,9 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) align(4) %in.byref) { ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF @@ -1958,9 +1958,9 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; HSA-VI-LABEL: name: p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; 
HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), align 16, addrspace 4) @@ -1970,9 +1970,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (p3), addrspace 4) @@ -1986,9 +1986,9 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; HSA-VI-LABEL: name: p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -1996,9 +1996,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: 
[[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; LEGACY-MESA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), addrspace 4) @@ -2022,9 +2022,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p1>), align 4, addrspace 4) @@ -2038,9 +2038,9 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; HSA-VI-LABEL: name: v2p3i8_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: 
$sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 16, addrspace 4) @@ -2050,9 +2050,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x p3>), align 4, addrspace 4) @@ -2066,9 +2066,9 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x ptr addrspace(3)> } %arg) nounwind { ; HSA-VI-LABEL: name: v2p1i8_in_struct_arg ; HSA-VI: bb.1 (%ir-block.0): - ; HSA-VI-NEXT: liveins: $sgpr6_sgpr7 + ; HSA-VI-NEXT: liveins: $sgpr4_sgpr5 ; HSA-VI-NEXT: {{ $}} - ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 + ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; HSA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), addrspace 4) @@ -2084,9 +2084,9 @@ define 
amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): - ; LEGACY-MESA-VI-NEXT: liveins: $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: {{ $}} - ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 + ; LEGACY-MESA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; LEGACY-MESA-VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; LEGACY-MESA-VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll index 6b0e9618754df8..eebbe20abd043e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll @@ -4,9 +4,6 @@ define amdgpu_kernel void @system_one_as_acquire() { ; CHECK-LABEL: name: system_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acquire @@ -16,9 +13,6 @@ define amdgpu_kernel void @system_one_as_acquire() { define amdgpu_kernel void @system_one_as_release() { ; CHECK-LABEL: name: system_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") release @@ -28,9 +22,6 @@ define amdgpu_kernel void @system_one_as_release() { define amdgpu_kernel void @system_one_as_acq_rel() { ; CHECK-LABEL: name: system_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: 
$sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") acq_rel @@ -40,9 +31,6 @@ define amdgpu_kernel void @system_one_as_acq_rel() { define amdgpu_kernel void @system_one_as_seq_cst() { ; CHECK-LABEL: name: system_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 2 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("one-as") seq_cst @@ -52,9 +40,6 @@ define amdgpu_kernel void @system_one_as_seq_cst() { define amdgpu_kernel void @singlethread_one_as_acquire() { ; CHECK-LABEL: name: singlethread_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acquire @@ -64,9 +49,6 @@ define amdgpu_kernel void @singlethread_one_as_acquire() { define amdgpu_kernel void @singlethread_one_as_release() { ; CHECK-LABEL: name: singlethread_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") release @@ -76,9 +58,6 @@ define amdgpu_kernel void @singlethread_one_as_release() { define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; CHECK-LABEL: name: singlethread_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") acq_rel @@ -88,9 +67,6 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel() { define amdgpu_kernel void 
@singlethread_one_as_seq_cst() { ; CHECK-LABEL: name: singlethread_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 3 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread-one-as") seq_cst @@ -100,9 +76,6 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst() { define amdgpu_kernel void @agent_one_as_acquire() { ; CHECK-LABEL: name: agent_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acquire @@ -112,9 +85,6 @@ define amdgpu_kernel void @agent_one_as_acquire() { define amdgpu_kernel void @agent_one_as_release() { ; CHECK-LABEL: name: agent_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") release @@ -124,9 +94,6 @@ define amdgpu_kernel void @agent_one_as_release() { define amdgpu_kernel void @agent_one_as_acq_rel() { ; CHECK-LABEL: name: agent_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") acq_rel @@ -136,9 +103,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel() { define amdgpu_kernel void @agent_one_as_seq_cst() { ; CHECK-LABEL: name: agent_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 4 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent-one-as") seq_cst @@ -148,9 +112,6 @@ define 
amdgpu_kernel void @agent_one_as_seq_cst() { define amdgpu_kernel void @workgroup_one_as_acquire() { ; CHECK-LABEL: name: workgroup_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acquire @@ -160,9 +121,6 @@ define amdgpu_kernel void @workgroup_one_as_acquire() { define amdgpu_kernel void @workgroup_one_as_release() { ; CHECK-LABEL: name: workgroup_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") release @@ -172,9 +130,6 @@ define amdgpu_kernel void @workgroup_one_as_release() { define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; CHECK-LABEL: name: workgroup_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") acq_rel @@ -184,9 +139,6 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() { define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; CHECK-LABEL: name: workgroup_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 5 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup-one-as") seq_cst @@ -196,9 +148,6 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() { define amdgpu_kernel void @wavefront_one_as_acquire() { ; CHECK-LABEL: name: wavefront_one_as_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; 
CHECK-NEXT: G_FENCE 4, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acquire @@ -208,9 +157,6 @@ define amdgpu_kernel void @wavefront_one_as_acquire() { define amdgpu_kernel void @wavefront_one_as_release() { ; CHECK-LABEL: name: wavefront_one_as_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") release @@ -220,9 +166,6 @@ define amdgpu_kernel void @wavefront_one_as_release() { define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; CHECK-LABEL: name: wavefront_one_as_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") acq_rel @@ -232,9 +175,6 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel() { define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; CHECK-LABEL: name: wavefront_one_as_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 6 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront-one-as") seq_cst @@ -244,9 +184,6 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst() { define amdgpu_kernel void @system_acquire() { ; CHECK-LABEL: name: system_acquire ; CHECK: bb.1.entry: - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: S_ENDPGM 0 entry: ret void @@ -255,9 +192,6 @@ entry: define amdgpu_kernel void @system_release() { ; CHECK-LABEL: name: system_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 1 ; 
CHECK-NEXT: S_ENDPGM 0 fence release @@ -267,9 +201,6 @@ define amdgpu_kernel void @system_release() { define amdgpu_kernel void @system_acq_rel() { ; CHECK-LABEL: name: system_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 1 ; CHECK-NEXT: S_ENDPGM 0 fence acq_rel @@ -279,9 +210,6 @@ define amdgpu_kernel void @system_acq_rel() { define amdgpu_kernel void @system_seq_cst() { ; CHECK-LABEL: name: system_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 1 ; CHECK-NEXT: S_ENDPGM 0 fence seq_cst @@ -291,9 +219,6 @@ define amdgpu_kernel void @system_seq_cst() { define amdgpu_kernel void @singlethread_acquire() { ; CHECK-LABEL: name: singlethread_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acquire @@ -303,9 +228,6 @@ define amdgpu_kernel void @singlethread_acquire() { define amdgpu_kernel void @singlethread_release() { ; CHECK-LABEL: name: singlethread_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") release @@ -315,9 +237,6 @@ define amdgpu_kernel void @singlethread_release() { define amdgpu_kernel void @singlethread_acq_rel() { ; CHECK-LABEL: name: singlethread_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") acq_rel @@ -327,9 +246,6 @@ 
define amdgpu_kernel void @singlethread_acq_rel() { define amdgpu_kernel void @singlethread_seq_cst() { ; CHECK-LABEL: name: singlethread_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 0 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("singlethread") seq_cst @@ -339,9 +255,6 @@ define amdgpu_kernel void @singlethread_seq_cst() { define amdgpu_kernel void @agent_acquire() { ; CHECK-LABEL: name: agent_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acquire @@ -351,9 +264,6 @@ define amdgpu_kernel void @agent_acquire() { define amdgpu_kernel void @agent_release() { ; CHECK-LABEL: name: agent_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") release @@ -363,9 +273,6 @@ define amdgpu_kernel void @agent_release() { define amdgpu_kernel void @agent_acq_rel() { ; CHECK-LABEL: name: agent_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") acq_rel @@ -375,9 +282,6 @@ define amdgpu_kernel void @agent_acq_rel() { define amdgpu_kernel void @agent_seq_cst() { ; CHECK-LABEL: name: agent_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 7 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("agent") seq_cst @@ -387,9 +291,6 @@ define amdgpu_kernel void @agent_seq_cst() { define amdgpu_kernel 
void @workgroup_acquire() { ; CHECK-LABEL: name: workgroup_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acquire @@ -399,9 +300,6 @@ define amdgpu_kernel void @workgroup_acquire() { define amdgpu_kernel void @workgroup_release() { ; CHECK-LABEL: name: workgroup_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") release @@ -411,9 +309,6 @@ define amdgpu_kernel void @workgroup_release() { define amdgpu_kernel void @workgroup_acq_rel() { ; CHECK-LABEL: name: workgroup_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") acq_rel @@ -423,9 +318,6 @@ define amdgpu_kernel void @workgroup_acq_rel() { define amdgpu_kernel void @workgroup_seq_cst() { ; CHECK-LABEL: name: workgroup_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 8 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("workgroup") seq_cst @@ -435,9 +327,6 @@ define amdgpu_kernel void @workgroup_seq_cst() { define amdgpu_kernel void @wavefront_acquire() { ; CHECK-LABEL: name: wavefront_acquire ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 4, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acquire @@ -447,9 +336,6 @@ define amdgpu_kernel void @wavefront_acquire() { define amdgpu_kernel void 
@wavefront_release() { ; CHECK-LABEL: name: wavefront_release ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 5, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") release @@ -459,9 +345,6 @@ define amdgpu_kernel void @wavefront_release() { define amdgpu_kernel void @wavefront_acq_rel() { ; CHECK-LABEL: name: wavefront_acq_rel ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 6, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") acq_rel @@ -471,9 +354,6 @@ define amdgpu_kernel void @wavefront_acq_rel() { define amdgpu_kernel void @wavefront_seq_cst() { ; CHECK-LABEL: name: wavefront_seq_cst ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr6_sgpr7 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr6_sgpr7 ; CHECK-NEXT: G_FENCE 7, 9 ; CHECK-NEXT: S_ENDPGM 0 fence syncscope("wavefront") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index 8813462652efdb..ecad793ad58987 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -810,14 +810,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; 
GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -932,14 +932,14 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store (s64) into %ir.alloca1 + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) 
= COPY [[COPY6]] + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -978,14 +978,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = 
COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1096,14 +1096,14 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32) ; GCN-NEXT: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store (s32) into %ir.alloca + 8, addrspace 5) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed - ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY48:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 @@ -1170,26 +1170,26 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; 
GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i64_fastcc_i64 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1211,23 +1211,50 @@ entry: ret i64 %ret } -declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) #1 +declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0) define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 { ; GCN-LABEL: name: sibling_call_p1i8_fastcc_p1i8 ; GCN: bb.1.entry: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY9]](s32), [[COPY10]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @p1i8_fastcc_p1i8 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = 
COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) ; GCN-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN-NEXT: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN-NEXT: $sgpr8_sgpr9 = COPY [[COPY13]](p4) + ; GCN-NEXT: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN-NEXT: $sgpr12 = COPY [[COPY15]](s32) + ; GCN-NEXT: $sgpr13 = COPY [[COPY16]](s32) + ; GCN-NEXT: $sgpr14 = COPY [[COPY17]](s32) + ; GCN-NEXT: $sgpr15 = COPY [[COPY18]](s32) + ; GCN-NEXT: $vgpr31 = COPY [[COPY19]](s32) + ; GCN-NEXT: SI_TCRETURN [[GV]](p0), @p1i8_fastcc_p1i8, 0, csr_amdgpu, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 entry: %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a) ret ptr addrspace(1) %ret @@ -1241,25 +1268,25 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY 
$sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @i16_fastcc_i16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1288,25 +1315,25 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 { ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @f16_fastcc_f16 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) @@ -1335,28 +1362,28 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x 
s16>) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v3i16_fastcc_v3i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s16>) ; GCN-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF @@ -1389,26 +1416,26 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GCN-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v4i16_fastcc_v4i16 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; GCN-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), 
[[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) @@ -1438,14 +1465,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32(s32) = COPY $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64(s64) = COPY $sgpr10_sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr6_sgpr7 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1454,14 +1481,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 ; GCN-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY11]](s32), [[COPY12]](s32) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GCN-NEXT: [[GV:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @v2i64_fastcc_v2i64 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]](p4) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY8]] ; 
GCN-NEXT: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY7]](p4) - ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4) - ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]](s64) - ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) - ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) - ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN-NEXT: [[COPY15:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; GCN-NEXT: [[COPY16:%[0-9]+]]:_(s64) = COPY [[COPY5]] + ; GCN-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; GCN-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; GCN-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s64>) ; GCN-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1486,7 +1513,7 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index c3938e673a6da6..2f718814ef77b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 
{ ; CHECK-LABEL: use_lds_globals: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll index b8b7256011df89..7587aa0cad2d4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s @@ -11,25 +11,25 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.1 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds - ; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], 
implicit-def dead $scc - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 - ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc + ; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]] ; GCN: $m0 = S_MOV_B32 -1 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec - ; GFX9: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec - ; GFX8: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; GFX8: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX8: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec - ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX9: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec + ; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX6: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec + ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX8: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 90f34acaa17aae..1a49a38158122e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -48,7 +48,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v1, v0, s[2:3] glc dlc @@ -62,9 +62,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -93,7 +91,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -112,7 +110,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -133,7 +131,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -147,9 +145,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_div_scale_f32_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -178,7 +174,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -188,7 +184,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -197,11 +193,12 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 @@ -210,9 +207,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: 
s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -220,35 +215,31 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc +; 
GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -269,7 +260,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -279,7 +270,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -288,11 +279,12 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, 
vcc, 8, v0 @@ -301,9 +293,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -311,35 +301,31 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
global_load_b64 v[0:1], v2, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -360,8 +346,8 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -378,8 +364,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -396,10 +382,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -410,12 +395,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x54 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x54 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -438,8 +420,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) { ; GFX7-LABEL: test_div_scale_f32_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -456,8 +438,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -474,10 +456,9 @@ define amdgpu_kernel void 
@test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -488,12 +469,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -516,8 +494,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -534,8 +512,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -552,10 +530,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -566,12 +543,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -594,8 +568,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) { ; GFX7-LABEL: test_div_scale_f32_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -612,8 +586,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f32_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -630,10 +604,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] @@ -644,12 +617,9 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f32_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -672,8 +642,8 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) 
%out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -690,8 +660,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -708,10 +678,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -722,13 +691,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -751,8 +717,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) { ; GFX7-LABEL: test_div_scale_f64_scalar_num_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -769,8 +735,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -787,10 +753,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -801,13 +766,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_num_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -830,8 +792,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -848,8 +810,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -866,10 +828,9 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -880,13 +841,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -909,8 +867,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_scalar_den_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 @@ -927,8 +885,8 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -945,10 +903,9 @@ define amdgpu_kernel void 
@test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] @@ -959,13 +916,10 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; ; GFX11-LABEL: test_div_scale_f64_scalar_den_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -988,26 +942,25 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s5 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1016,24 +969,24 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s5, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1047,26 +1000,25 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) { ; GFX7-LABEL: test_div_scale_f32_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x1c -; GFX7-NEXT: s_load_dword s5, s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s5, v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x70 -; GFX8-NEXT: s_load_dword s1, s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s1, v0, s1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1075,24 +1027,24 @@ define amdgpu_kernel 
void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f32_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s5, s4 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f32_all_scalar_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1106,13 +1058,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1120,14 +1072,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1136,26 +1087,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[4:5], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1169,13 +1118,13 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) { ; GFX7-LABEL: test_div_scale_f64_all_scalar_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[6:7], v[0:1], s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1183,14 +1132,13 @@ define 
amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[4:5], v[0:1], s[4:5] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1199,26 +1147,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX10-LABEL: test_div_scale_f64_all_scalar_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: test_div_scale_f64_all_scalar_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x74 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: 
s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[4:5], s[0:1] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1232,7 +1178,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1249,7 +1195,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1266,7 +1212,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1278,10 +1224,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1303,7 +1247,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1320,7 +1264,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1337,7 +1281,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1349,10 +1293,8 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; ; GFX11-LABEL: test_div_scale_f32_inline_imm_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1374,7 +1316,7 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1394,7 +1336,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1416,7 +1358,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1431,9 +1373,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_num: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1466,7 +1406,7 @@ define amdgpu_kernel void 
@test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1486,7 +1426,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -1508,7 +1448,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1523,9 +1463,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: test_div_scale_f32_fabs_den: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1558,7 +1496,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 @@ -1570,8 +1508,8 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_val_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1580,7 +1518,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 @@ -1589,7 +1527,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_val_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 @@ -1606,7 +1544,7 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_val_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 @@ -1618,8 +1556,8 @@ define amdgpu_kernel 
void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; GFX8-LABEL: test_div_scale_f32_undef_val_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1628,7 +1566,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 @@ -1637,7 +1575,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_div_scale_f32_undef_val_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 @@ -1654,7 +1592,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -1664,8 +1602,8 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, s0 -; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1674,7 +1612,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX10-LABEL: test_div_scale_f32_undef_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 @@ -1683,7 +1621,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % ; ; GFX11-LABEL: test_div_scale_f32_undef_undef_val: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 @@ -1700,7 +1638,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1714,8 +1652,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 @@ -1724,8 +1662,8 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], 0x40200000 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1734,7 +1672,7 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou ; GFX11-LABEL: test_div_scale_f64_val_undef_val: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index 2a260823732ca9..d7b7f03d428bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 @@ -14,7 +14,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB0_2: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -25,16 +25,16 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %mid ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_2: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index 06393857352b3a..81d8472ebd46ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index 59818b0b1bc39b..ade6e55b482bb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -134,8 +134,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -147,10 +147,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -160,7 +160,7 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN @@ -179,8 +179,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; 
GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -190,16 +190,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll index de91c45000f137..752ddbb896c6b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -61,8 +61,8 @@ define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %da define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) { ; GFX908-LABEL: global_atomic_fadd_f32_off_ss: ; GFX908: ; %bb.0: -; 
GFX908-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX908-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v0, s2 @@ -71,8 +71,8 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, ; ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll index ec069c10a8d212..1e0cbde7df0dbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 @@ -22,14 +22,14 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, 1, 0 
-; GFX11-NEXT: s_and_b32 s0, 1, s0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: s_or_b32 s0, s1, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index d7a82b415ff06c..9718cef5c6db0e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[6:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xa +; GCN-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 69f9a5712b0b5a..546376c5962be7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; 
GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -658,7 +658,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 @@ -688,33 +688,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 -; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -742,7 +742,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 @@ -769,7 +769,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) @@ -796,29 +796,28 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 
0x45004800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -847,8 +846,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -876,8 +875,8 @@ define amdgpu_kernel void 
@image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -889,8 +888,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -904,30 +903,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s17, 0x102 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 ; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 
-; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v4, s9 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s16 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v10, s17 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v11, v[0:1] @@ -959,8 +957,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -985,8 +983,8 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 @@ -995,8 +993,8 @@ define 
amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v0, s2 +; GFX1013-NEXT: v_mov_b32_e32 v1, s3 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: flat_load_dword v2, v[0:1] @@ -1010,24 +1008,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6 -; GFX11-NEXT: s_mov_b32 s6, 2.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_movk_i32 s13, 0x102 +; GFX11-NEXT: s_mov_b32 s6, 2.0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: v_add_co_u32 v0, 
vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_load_b32 v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index b0c6e89380d810..5c22d5bdcf7449 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x32 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x32 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,14 +53,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-LABEL: is_private_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x32 +; CI-NEXT: s_load_dword s0, s[4:5], 0x32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -94,7 +93,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -108,7 +107,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -122,7 +121,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index bbcb807a956bee..e005c38355a3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x33 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x33 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -39,7 +39,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -53,14 +53,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -79,9 +78,9 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-LABEL: is_local_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s0, s[6:7], 0x33 +; CI-NEXT: s_load_dword s0, s[4:5], 0x33 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_lg_u32 s1, s0 ; CI-NEXT: s_cbranch_scc1 .LBB1_2 @@ -94,7 +93,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u32 s1, s3 @@ -108,7 +107,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s1, s3 @@ -122,7 +121,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index 1676b69c8c6318..7fc9842824b01d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -81,7 +81,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #4 { +define amdgpu_kernel void @test_no_kernargs() #1 { %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() %gep = getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -126,7 +126,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } -attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 4d012796693cb6..e7faabb72ab691 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GCN-NEXT: 
s_mov_b64 s[36:37], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s36, 2 @@ -81,7 +81,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -127,7 +127,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -157,7 +157,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[18:19], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s18, 2 @@ -204,7 +204,7 @@ bb: define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[6:7], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GCN-NEXT: s_mov_b32 s6, 2 @@ -235,11 +235,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_4x4x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; GCN-NEXT: s_nop 3 @@ -258,8 +258,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -292,11 +292,11 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 @@ -317,8 +317,8 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 
s[12:13], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0 ; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -352,9 +352,9 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000 ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index aa21e67544d65d..c0cd0686072002 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 @@ -22,22 +22,22 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; 
encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c ; encoding: [0x01,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; encoding: [0x01,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] @@ -50,7 +50,7 @@ 
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-LABEL: mov_dpp64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] @@ -75,7 +75,7 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; ; GFX11-LABEL: mov_dpp64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; encoding: [0x01,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll index dd351e193e9e6e..1eb0c2a8774258 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll @@ -2,20 +2,10 @@ ; FIXME: Error on non-hsa target -; GCN-LABEL: {{^}}queue_ptr: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 
0x0 -; GCN: .amdhsa_user_sgpr_queue_ptr 1 -define amdgpu_kernel void @queue_ptr(ptr addrspace(1) %out) { - %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 - %value = load i32, ptr addrspace(4) %queue_ptr - store i32 %value, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}queue_ptr_opt: +; GCN-LABEL: {{^}}test: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 -define amdgpu_kernel void @queue_ptr_opt(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test(ptr addrspace(1) %out) { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %value = load i32, ptr addrspace(4) %queue_ptr store i32 %value, ptr addrspace(1) %out @@ -25,7 +15,6 @@ define amdgpu_kernel void @queue_ptr_opt(ptr addrspace(1) %out) #1 { declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-no-dispatch-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 5a4b4e62bd8ae5..6d4aa3b04d7612 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -62,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: 
bfe_i32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -118,7 +118,7 @@ define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword 
s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s4, 8 +; GFX6-NEXT: s_bfe_i32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -173,7 +173,7 @@ define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -215,7 +215,7 @@ define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -234,7 +234,7 @@ define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -272,7 +272,7 @@ define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -291,7 +291,7 @@ define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -311,7 +311,7 @@ define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel 
void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -347,7 +347,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
; GFX6-NEXT: s_bfe_i32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -411,7 +411,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -428,7 +428,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -445,7 +445,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -462,7 +462,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) # 
define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_i32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -479,7 +479,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_i32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -496,7 +496,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0xffff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -513,7 +513,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -530,7 +530,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -547,7 +547,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_i32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -564,7 +564,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_i32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -598,7 +598,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -614,7 +614,7 @@ define amdgpu_kernel 
void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_i32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_i32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -648,7 +648,7 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -670,7 +670,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -694,7 +694,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_0_width: 
; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -713,7 +713,7 @@ define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -734,7 +734,7 @@ define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -756,7 +756,7 @@ define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -778,7 +778,7 @@ define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 
0x80000 @@ -799,7 +799,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 @@ -820,7 +820,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -844,7 +844,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -890,7 +890,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, pt define amdgpu_kernel void 
@sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, pt define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 5074f8814546ea..0c60be9d94591a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,11 +4,11 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec @@ -23,7 +23,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -43,20 +43,20 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 -; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cselect_b32 s4, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_mov_b32 s2, 1 -; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 @@ -96,12 +96,12 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -116,7 +116,7 @@ 
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0xcccccccd ; GCN-NEXT: s_mov_b32 s5, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -140,12 +140,12 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -160,12 +160,12 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -180,7 +180,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: 
set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -204,7 +204,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 1.0 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -228,12 +228,12 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_not_b64 exec, exec @@ -248,7 +248,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -272,7 +272,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -296,7 +296,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -320,7 +320,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -340,11 +340,11 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -359,11 +359,11 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c 
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -378,11 +378,11 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec @@ -397,11 +397,11 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index f3654fea486e0c..1d5cc1e1ec0463 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -40,8 
+40,8 @@ define double @v_trig_preop_f64_imm(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -57,8 +57,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -86,8 +86,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -97,10 +97,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX11-LABEL: s_trig_preop_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], s2 +; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[2:3], s0 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -127,7 +127,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -142,7 +142,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -160,7 +160,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX11-LABEL: 
s_trig_preop_f64_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX11-NEXT: flat_store_b64 v[0:1], v[0:1] dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index d7fbec74af3858..43a0f018dc1cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,7 +44,7 @@ define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i3 define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s3, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -63,7 +63,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 @@ -81,7 +81,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 @@ -99,7 +99,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr 
addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -118,7 +118,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 @@ -135,11 +135,11 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_imm_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 8 +; GFX6-NEXT: s_bfe_u32 s3, s3, 8 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -197,7 +197,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -220,7 +220,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -243,7 +243,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -266,7 +266,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define 
amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -289,7 +289,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -331,7 +331,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -373,7 +373,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -438,7 +438,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -459,7 +459,7 @@ define 
amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -518,7 +518,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -537,7 +537,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -577,7 +577,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -597,7 +597,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0x302e, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -629,7 +629,7 @@ 
define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -645,7 +645,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, 1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -661,7 +661,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -677,7 +677,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x10007 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -694,7 +694,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_6: ; 
GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x80, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -711,7 +711,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80000 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -728,7 +728,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80006 ; GFX6-NEXT: s_bfe_u32 s2, 0x7f, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -745,7 +745,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x80010 ; GFX6-NEXT: s_bfe_u32 s2, 0x10000, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -762,7 +762,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0xffff, s2 ; 
GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -779,7 +779,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x40004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -796,7 +796,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -813,7 +813,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x100010 ; GFX6-NEXT: s_bfe_u32 s2, 0x1fffe, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -830,7 +830,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1e0002 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -847,7 +847,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr 
addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1c0004 ; GFX6-NEXT: s_bfe_u32 s2, 0xa0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -864,7 +864,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -880,7 +880,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_17: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1f0001 ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -897,7 +897,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: bfe_u32_constant_fold_test_18: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, 0x1001f ; GFX6-NEXT: s_bfe_u32 s2, 0xff, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x4 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -947,11 +947,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -965,7 +965,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s3, 7 @@ -983,11 +983,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1001,11 +1001,11 @@ define amdgpu_kernel void @and_lshr(ptr 
addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: and_lshr2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x30006 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1019,11 +1019,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; GFX6-LABEL: shl_lshr: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s3, s4, 0x150002 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x150002 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 2198ba9f1d964d..727184a36c0067 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, 
i32 %in2) { ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -30,7 +30,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -46,7 +46,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dppi64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX10-LABEL: update_dppi64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -78,11 +78,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; ; GFX11-LABEL: update_dppi64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; 
GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -102,7 +101,7 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) { ; GFX8-LABEL: update_dppf64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -120,7 +119,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX10-LABEL: update_dppf64_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -134,11 +133,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; ; GFX11-LABEL: update_dppf64_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -158,7 +156,7 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +174,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -190,11 +188,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -214,7 +211,7 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -232,7 +229,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 
v[0:1], v4, s[0:1] @@ -246,11 +243,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -270,7 +266,7 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p0_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -288,7 +284,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX10-LABEL: update_dpp_p0_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] @@ -302,11 +298,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; ; GFX11-LABEL: update_dpp_p0_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: 
global_load_b64 v[0:1], v4, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 @@ -326,7 +321,7 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) { ; GFX8-LABEL: update_dpp_p3_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -341,7 +336,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p3_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -354,8 +349,7 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa ; ; GFX11-LABEL: update_dpp_p3_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -377,11 +371,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; GFX8-LABEL: update_dpp_p5_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s90, -1 ; GFX8-NEXT: s_mov_b32 s91, 0xe80000 -; GFX8-NEXT: s_add_u32 s88, s88, s9 +; GFX8-NEXT: s_add_u32 s88, s88, s3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) @@ -396,27 +390,26 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa ; ; GFX10-LABEL: update_dpp_p5_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s14, -1 -; GFX10-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-NEXT: s_add_u32 s12, s12, s9 -; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GFX10-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dpp_p5_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll index b2546700a935db..df201c1903b642 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workgroup.id.ll @@ -1,8 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa 
-passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll index d5646820a19832..09882c446fc0fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,14 +1,12 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll -; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel 
-mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs| FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii 
-verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -verify-machineinstrs -amdgpu-enable-vopd=0 | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index f555d53462c182..449eb0d9cdfc2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-LABEL: localize_constants: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 @@ -95,7 +95,7 @@ bb2: define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-LABEL: localize_globals: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 51d070456b90c2..2f0453bba0d79e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -10,7 +10,7 @@ ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. -define amdgpu_kernel void @system_one_as_acquire() #0 { +define amdgpu_kernel void @system_one_as_acquire() { ; GFX6-LABEL: name: system_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -71,7 +71,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_release() #0 { +define amdgpu_kernel void @system_one_as_release() { ; GFX6-LABEL: name: system_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -122,7 +122,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_acq_rel() #0 { +define amdgpu_kernel void @system_one_as_acq_rel() { ; GFX6-LABEL: name: system_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -183,7 +183,7 @@ entry: ret void } -define amdgpu_kernel void @system_one_as_seq_cst() #0 { +define amdgpu_kernel void @system_one_as_seq_cst() { ; GFX6-LABEL: name: system_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -244,7 +244,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acquire() #0 { +define amdgpu_kernel void @singlethread_one_as_acquire() { ; GFX6-LABEL: name: singlethread_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -285,7 +285,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_release() #0 { +define amdgpu_kernel void @singlethread_one_as_release() { ; 
GFX6-LABEL: name: singlethread_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -326,7 +326,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 { +define amdgpu_kernel void @singlethread_one_as_acq_rel() { ; GFX6-LABEL: name: singlethread_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -367,7 +367,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 { +define amdgpu_kernel void @singlethread_one_as_seq_cst() { ; GFX6-LABEL: name: singlethread_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -408,7 +408,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acquire() #0 { +define amdgpu_kernel void @agent_one_as_acquire() { ; GFX6-LABEL: name: agent_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -469,7 +469,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_release() #0 { +define amdgpu_kernel void @agent_one_as_release() { ; GFX6-LABEL: name: agent_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -520,7 +520,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_acq_rel() #0 { +define amdgpu_kernel void @agent_one_as_acq_rel() { ; GFX6-LABEL: name: agent_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -581,7 +581,7 @@ entry: ret void } -define amdgpu_kernel void @agent_one_as_seq_cst() #0 { +define amdgpu_kernel void @agent_one_as_seq_cst() { ; GFX6-LABEL: name: agent_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -642,7 +642,7 @@ entry: ret void } -define amdgpu_kernel 
void @workgroup_one_as_acquire() #0 { +define amdgpu_kernel void @workgroup_one_as_acquire() { ; GFX6-LABEL: name: workgroup_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -689,7 +689,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_release() #0 { +define amdgpu_kernel void @workgroup_one_as_release() { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -734,7 +734,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { +define amdgpu_kernel void @workgroup_one_as_acq_rel() { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -781,7 +781,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { +define amdgpu_kernel void @workgroup_one_as_seq_cst() { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -828,7 +828,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acquire() #0 { +define amdgpu_kernel void @wavefront_one_as_acquire() { ; GFX6-LABEL: name: wavefront_one_as_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -869,7 +869,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_release() #0 { +define amdgpu_kernel void @wavefront_one_as_release() { ; GFX6-LABEL: name: wavefront_one_as_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -910,7 +910,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 { +define amdgpu_kernel void @wavefront_one_as_acq_rel() { ; GFX6-LABEL: name: wavefront_one_as_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: 
frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -951,7 +951,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 { +define amdgpu_kernel void @wavefront_one_as_seq_cst() { ; GFX6-LABEL: name: wavefront_one_as_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -992,7 +992,7 @@ entry: ret void } -define amdgpu_kernel void @system_acquire() #0 { +define amdgpu_kernel void @system_acquire() { ; GFX6-LABEL: name: system_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1053,7 +1053,7 @@ entry: ret void } -define amdgpu_kernel void @system_release() #0 { +define amdgpu_kernel void @system_release() { ; GFX6-LABEL: name: system_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1104,7 +1104,7 @@ entry: ret void } -define amdgpu_kernel void @system_acq_rel() #0 { +define amdgpu_kernel void @system_acq_rel() { ; GFX6-LABEL: name: system_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1165,7 +1165,7 @@ entry: ret void } -define amdgpu_kernel void @system_seq_cst() #0 { +define amdgpu_kernel void @system_seq_cst() { ; GFX6-LABEL: name: system_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1226,7 +1226,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acquire() #0 { +define amdgpu_kernel void @singlethread_acquire() { ; GFX6-LABEL: name: singlethread_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1267,7 +1267,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_release() #0 { +define amdgpu_kernel void @singlethread_release() { ; GFX6-LABEL: name: singlethread_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup 
CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1308,7 +1308,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_acq_rel() #0 { +define amdgpu_kernel void @singlethread_acq_rel() { ; GFX6-LABEL: name: singlethread_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1349,7 +1349,7 @@ entry: ret void } -define amdgpu_kernel void @singlethread_seq_cst() #0 { +define amdgpu_kernel void @singlethread_seq_cst() { ; GFX6-LABEL: name: singlethread_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1390,7 +1390,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acquire() #0 { +define amdgpu_kernel void @agent_acquire() { ; GFX6-LABEL: name: agent_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1451,7 +1451,7 @@ entry: ret void } -define amdgpu_kernel void @agent_release() #0 { +define amdgpu_kernel void @agent_release() { ; GFX6-LABEL: name: agent_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1502,7 +1502,7 @@ entry: ret void } -define amdgpu_kernel void @agent_acq_rel() #0 { +define amdgpu_kernel void @agent_acq_rel() { ; GFX6-LABEL: name: agent_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1563,7 +1563,7 @@ entry: ret void } -define amdgpu_kernel void @agent_seq_cst() #0 { +define amdgpu_kernel void @agent_seq_cst() { ; GFX6-LABEL: name: agent_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1624,7 +1624,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_acquire() #0 { +define amdgpu_kernel void @workgroup_acquire() { ; GFX6-LABEL: name: workgroup_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 
0x36, 0xe9, 0x02 @@ -1675,7 +1675,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_release() #0 { +define amdgpu_kernel void @workgroup_release() { ; GFX6-LABEL: name: workgroup_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1724,7 +1724,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_acq_rel() #0 { +define amdgpu_kernel void @workgroup_acq_rel() { ; GFX6-LABEL: name: workgroup_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1775,7 +1775,7 @@ entry: ret void } -define amdgpu_kernel void @workgroup_seq_cst() #0 { +define amdgpu_kernel void @workgroup_seq_cst() { ; GFX6-LABEL: name: workgroup_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1826,7 +1826,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acquire() #0 { +define amdgpu_kernel void @wavefront_acquire() { ; GFX6-LABEL: name: wavefront_acquire ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1867,7 +1867,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_release() #0 { +define amdgpu_kernel void @wavefront_release() { ; GFX6-LABEL: name: wavefront_release ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1908,7 +1908,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_acq_rel() #0 { +define amdgpu_kernel void @wavefront_acq_rel() { ; GFX6-LABEL: name: wavefront_acq_rel ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 @@ -1949,7 +1949,7 @@ entry: ret void } -define amdgpu_kernel void @wavefront_seq_cst() #0 { +define amdgpu_kernel void @wavefront_seq_cst() { ; GFX6-LABEL: name: wavefront_seq_cst ; GFX6: bb.0.entry: ; GFX6-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 
0x02 @@ -1989,5 +1989,3 @@ entry: fence syncscope("wavefront") seq_cst ret void } - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index d133a67efcd655..b8d93a6dabbf1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -23,9 +23,7 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_mul_i64_no_zext: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -58,13 +56,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX10-NEXT: global_load_dword v4, v3, s[0:1] +; GFX10-NEXT: global_load_dword v4, v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -77,10 +75,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +108,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_mul_i64_zext_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -131,10 +127,8 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_mul_i64_zext_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -166,13 +160,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_zext_src0_src1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -182,12 +176,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_zext_src0_src1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] @@ -215,13 +207,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 @@ -234,10 +226,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -269,13 +259,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src0_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -286,10 +276,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src0_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -319,16 +307,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX10-LABEL: v_mul_i64_masked_src1_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: ; kill: killed $vgpr3 ; GFX10-NEXT: ; kill: killed $sgpr6_sgpr7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] -; GFX10-NEXT: ; kill: killed $sgpr0_sgpr1 +; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; GFX10-NEXT: ; kill: killed $sgpr2_sgpr3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -338,10 +326,8 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a ; GFX11-LABEL: v_mul_i64_masked_src1_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -369,7 +355,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a 
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul_i64_masked_src0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -379,7 +365,7 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_mul_i64_masked_src0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -403,13 +389,13 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-LABEL: v_mul_i64_partially_masked_src0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -426,10 +412,8 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-LABEL: v_mul_i64_partially_masked_src0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -466,7 +450,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; GFX10-LABEL: v_mul64_masked_before_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -476,7 +460,7 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_mul64_masked_before_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -514,13 +498,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-LABEL: v_mul64_masked_before_and_in_branch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] @@ -549,10 +533,8 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-LABEL: v_mul64_masked_before_and_in_branch: ; GFX11: ; 
%bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index b0f3eee3c73632..2d81452f9ef38d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2542,7 +2542,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_zext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2559,7 +2559,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_zext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2576,7 +2576,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_zext_with_sregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2590,7 +2590,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; 
GFX10-LABEL: s_mul_u64_zext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2604,7 +2604,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_zext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0 @@ -2619,7 +2619,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_zext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2718,7 +2718,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX7-LABEL: s_mul_u64_sext_with_sregs: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2738,7 +2738,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: s_mul_u64_sext_with_sregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: s_mul_u64_sext_with_sregs: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0 @@ -2775,7 +2775,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: s_mul_u64_sext_with_sregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -2792,7 +2792,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: s_mul_u64_sext_with_sregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: s_mul_u64_sext_with_sregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index ab91b8ad3e8c91..3e278eb94e2e27 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -13,33 +13,33 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: 
s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0xc +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s5, s[6:7], 0x10 -; GCN-NEXT: s_add_u32 s4, s32, 0x1000 +; GCN-NEXT: s_load_dword s7, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s5, s5, 2 -; GCN-NEXT: s_add_u32 s4, s4, s5 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_add_u32 s6, s6, s7 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -84,29 +84,29 @@ bb.2: define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { ; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[6:7], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_mov_b32 s33, 0 ; GCN-NEXT: s_movk_i32 s32, 0x1000 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s4, s[6:7], 0xc -; GCN-NEXT: s_add_u32 s5, s32, 0x1000 -; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s7, s32, 0x1000 +; GCN-NEXT: s_and_b32 s7, s7, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_lshl_b32 s6, s6, 2 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_add_u32 s6, s7, s6 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index cf69c50ed93572..b666f45521661c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -6,83 +6,83 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s8, s5, 31 -; GFX8-NEXT: s_add_i32 s0, s5, s8 -; GFX8-NEXT: s_xor_b32 s5, s0, s8 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: s_ashr_i32 s8, s7, 31 +; GFX8-NEXT: s_add_i32 s0, s7, s8 +; GFX8-NEXT: s_xor_b32 s7, s0, s8 +; GFX8-NEXT: 
v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_ashr_i32 s4, s6, 31 +; GFX8-NEXT: s_add_i32 s5, s6, s4 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s7, s6, s8 +; GFX8-NEXT: s_xor_b32 s5, s5, s4 +; GFX8-NEXT: s_xor_b32 s6, s4, s8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -95,17 +95,16 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -113,17 +112,18 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: 
v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -145,7 +145,7 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -305,7 +305,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s11, 31 @@ -459,7 +459,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s9, 31 ; GFX10-NEXT: s_ashr_i32 s12, s11, 31 @@ -616,7 +616,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -692,7 +692,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s8, s6, 31 ; GFX9-NEXT: s_add_i32 s6, s6, s8 @@ -765,7 +765,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s1, s10, 31 ; GFX10-NEXT: s_ashr_i32 s2, s11, 31 @@ -845,8 +845,8 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 ; GFX8-NEXT: s_add_i32 s0, s12, s2 @@ -986,19 +986,19 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s4 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_ashr_i32 s6, s13, 31 -; GFX9-NEXT: s_add_i32 s7, s13, s6 +; GFX9-NEXT: s_ashr_i32 s6, s12, 31 +; GFX9-NEXT: s_add_i32 s0, s12, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_add_i32 s5, s13, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 
-; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s8, s8, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -1017,62 +1017,62 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v2 -; GFX9-NEXT: s_xor_b32 s4, s12, s4 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 +; GFX9-NEXT: s_xor_b32 s6, s12, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: s_ashr_i32 s4, s14, 31 -; GFX9-NEXT: s_add_i32 s5, 
s14, s4 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: s_ashr_i32 s6, s14, 31 +; GFX9-NEXT: s_add_i32 s7, s14, s6 ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s5 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s6, s13, s6 -; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s6 +; GFX9-NEXT: s_xor_b32 s4, s13, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: s_ashr_i32 s4, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s4 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s6 +; GFX9-NEXT: s_xor_b32 s9, s9, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; 
GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 @@ -1080,27 +1080,27 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s7 +; GFX9-NEXT: s_xor_b32 s8, s8, s7 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: s_xor_b32 s4, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, 
v7, v8, vcc @@ -1112,12 +1112,12 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -1125,18 +1125,18 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 ; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s4, s12, s0 -; GFX10-NEXT: s_add_i32 s5, s13, s1 +; GFX10-NEXT: s_add_i32 s6, s12, s0 +; GFX10-NEXT: s_add_i32 s7, s13, s1 ; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s4, s0 -; GFX10-NEXT: s_xor_b32 s15, s5, s1 +; GFX10-NEXT: s_xor_b32 s14, s6, s0 +; GFX10-NEXT: s_xor_b32 s15, s7, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX10-NEXT: s_xor_b32 s13, s13, s3 @@ -1144,11 +1144,11 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s14 +; GFX10-NEXT: s_sub_i32 s6, 
0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s5, 0, s15 +; GFX10-NEXT: s_sub_i32 s7, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 @@ -1163,22 +1163,22 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s13 +; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s4, v3 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s4, s8, s16 -; GFX10-NEXT: s_add_i32 s5, s9, s17 +; GFX10-NEXT: s_add_i32 s6, s8, s16 +; GFX10-NEXT: s_add_i32 s7, s9, s17 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s4, s16 -; GFX10-NEXT: s_xor_b32 s11, s5, s17 +; GFX10-NEXT: s_xor_b32 s10, s6, s16 +; GFX10-NEXT: s_xor_b32 s11, s7, s17 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 @@ 
-1271,8 +1271,8 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s1, 31 @@ -1582,8 +1582,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 @@ -1885,8 +1885,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s16, s1, 31 ; GFX10-NEXT: s_ashr_i32 s4, s13, 31 @@ -2187,25 +2187,25 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; 
GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x80008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i8 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2222,52 +2222,52 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2280,19 +2280,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: sdiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: 
s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2300,17 +2299,18 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2332,14 +2332,14 @@ define amdgpu_kernel void 
@sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s4, 0, s8 +; GFX8-NEXT: s_sub_i32 s6, 0, s8 ; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80018 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2351,10 +2351,10 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2420,45 +2420,45 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80010 -; GFX9-NEXT: s_ashr_i32 s5, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_xor_b32 s8, s0, s5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0x80010 +; GFX9-NEXT: s_ashr_i32 s7, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_xor_b32 s8, s0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x80018 -; GFX9-NEXT: s_ashr_i32 
s9, s7, 31 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0x80018 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i8 s6, s4 +; GFX9-NEXT: s_sext_i32_i8 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s7 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x80008 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x80008 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 @@ -2469,25 +2469,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s5, s10, s5 -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 +; GFX9-NEXT: s_xor_b32 s6, s10, s7 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 @@ -2505,7 +2505,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x80018 ; GFX10-NEXT: s_bfe_i32 s3, s0, 0x80010 @@ -2517,36 +2517,36 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: s_xor_b32 s3, s3, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; 
GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_bfe_i32 s6, s0, 0x80008 ; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 @@ -2596,25 +2596,25 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, 
s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x100010 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,52 +2631,52 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: 
sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x100010 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -2689,19 +2689,18 @@ define amdgpu_kernel 
void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: sdiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x100010 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 ; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s5, s1, s4 +; GFX10-NEXT: s_xor_b32 s7, s1, s6 ; GFX10-NEXT: s_xor_b32 s0, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX10-NEXT: s_sub_i32 s1, 0, s5 -; GFX10-NEXT: s_xor_b32 s4, s8, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2709,17 +2708,18 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s8, s6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2741,14 +2741,14 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s8 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: s_sub_i32 s4, 0, s9 +; GFX8-NEXT: s_sub_i32 s6, 0, s9 ; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 ; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: s_xor_b32 s0, s0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2829,15 +2829,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: s_sext_i32_i16 s0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_bfe_i32 s5, s5, 
0x100010 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s5, s5, s7 @@ -2847,27 +2847,27 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 ; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX9-NEXT: s_ashr_i32 s11, s4, 31 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s11 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s1 ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 @@ -2924,36 +2924,36 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: s_xor_b32 s1, s1, s8 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_sub_i32 s6, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s4, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX10-NEXT: s_ashr_i32 s9, s4, 31 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_add_i32 s4, s4, s9 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s4, s4, s9 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 
s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -3002,25 +3002,25 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s6, 0x30008 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x30000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x30000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3037,12 +3037,12 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: 
v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 @@ -3052,39 +3052,39 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: sdivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x30000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 
v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3099,19 +3099,18 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: sdivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s0, 0x30008 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3129,14 +3128,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3153,25 +3153,25 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s5, s0, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s5 -; GFX8-NEXT: s_xor_b32 s8, s0, s5 +; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x1b0000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX8-NEXT: s_ashr_i32 s6, s4, 31 -; GFX8-NEXT: s_add_i32 s4, s4, s6 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: s_xor_b32 s4, s4, s6 -; GFX8-NEXT: s_xor_b32 s5, s6, s5 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3188,12 +3188,12 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) 
%out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 @@ -3203,39 +3203,39 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: sdivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_xor_b32 s5, s1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s1, 0, s5 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s1, 0, s7 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: 
v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 @@ -3250,19 +3250,18 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: sdivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_i32 s1, s1, 0x1b0000 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_ashr_i32 s5, s0, 31 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_xor_b32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s0, s0, s5 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_ashr_i32 s7, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_add_i32 s0, s0, s7 +; GFX10-NEXT: s_xor_b32 s1, s1, s6 +; GFX10-NEXT: s_xor_b32 s0, s0, s7 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX10-NEXT: s_sub_i32 s2, 0, s1 -; GFX10-NEXT: s_xor_b32 s4, s5, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -3280,14 +3279,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 
v2, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_xor_b32 s4, s7, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d9..7ad19a47970039 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -238,7 +238,7 @@ define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX7-LABEL: mulu24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -251,7 +251,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX8-LABEL: mulu24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: mulu24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -281,7 +281,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX10-LABEL: mulu24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 6, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 @@ -296,7 +296,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; ; GFX11-LABEL: mulu24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0 ; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] @@ -321,7 +321,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GFX7-LABEL: muli24_shl64: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -340,7 +340,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX8-LABEL: muli24_shl64: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +363,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX10-LABEL: muli24_shl64: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -393,17 +393,16 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; ; GFX11-LABEL: muli24_shl64: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 2d85081f5fc969..99aaec458c33ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -10,21 +10,21 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: 
store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -38,22 +38,22 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -67,21 +67,21 @@ define amdgpu_kernel void 
@store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -177,29 +177,29 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 @@ -209,7 +209,7 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 @@ -234,8 +234,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -289,14 +289,14 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -317,8 +317,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -347,20 +347,20 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, 
s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 ; GFX10-NEXT: s_lshr_b32 s3, s7, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 -; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v6, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: ds_write_b16 v1, v0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -404,11 +404,11 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -418,8 +418,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) 
%out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -434,11 +434,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -449,8 +449,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -465,21 +465,21 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 
; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -493,11 +493,11 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s7 @@ -508,8 +508,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 @@ -523,21 +523,21 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: ds_write_b128 v4, v[0:3] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -551,22 +551,22 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index 4ef79b752c4373..0f9ec965f2f0f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -10,20 +10,20 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -36,21 +36,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 @@ -63,21 +63,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 8 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 8 @@ -107,8 +107,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 @@ -150,32 +150,32 @@ define 
amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 16 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s6, 16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: s_lshr_b32 s1, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 8 +; GFX10-NEXT: s_lshr_b32 s3, s2, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v9, s3 -; GFX10-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, s0 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -195,8 +195,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 @@ -237,14 +237,14 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -260,8 +260,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s1, s4, 16 @@ -285,18 +285,18 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s0, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s0, s5, 16 +; GFX10-NEXT: s_lshr_b32 s1, s5, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: ds_write_b16 v1, v0 ; GFX10-NEXT: ds_write_b16 v1, v2 offset:4 @@ -309,8 +309,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s1, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 @@ -333,11 +333,11 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -346,8 +346,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; 
GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -361,11 +361,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -375,8 +375,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -390,11 +390,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -403,8 +403,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -418,11 +418,11 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 @@ -432,8 +432,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 @@ -447,20 +447,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -473,21 +473,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 8b94f93e44e561..a58397eccaba76 
100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -6,32 +6,32 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -41,30 +41,30 @@ define 
amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_sub_i32 s0, 0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -73,28 +73,28 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 
v0, s5 -; GFX10-NEXT: s_sub_i32 s0, 0, s5 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s0, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5 +; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -112,7 +112,7 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) { ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -251,7 +251,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i64: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -384,7 +384,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -522,7 +522,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -576,7 +576,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -627,7 +627,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> 
%y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,7 +783,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -792,7 +792,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -800,7 +799,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_sub_i32 s4, 0, s14 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 @@ -878,9 +878,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -979,8 +979,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1510,7 +1510,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 @@ -1546,9 +1546,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, 
s2, v10, v[3:4] ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 ; GFX10-NEXT: s_subb_u32 s3, 0, s15 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5] ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6] ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 @@ -1560,39 +1560,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5 +; GFX10-NEXT: v_add_co_u32 v6, s6, v6, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v11, s6, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v2, s6, v16, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v6, s6, v11, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v1, s6, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v2, 
v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v2, s6, v2, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_add_co_u32 v4, s6, v6, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1 +; GFX10-NEXT: v_add_co_u32 v1, s6, v2, v1 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4] @@ -1772,34 +1772,34 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) { ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x80008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; 
GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1809,32 +1809,32 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xff +; 
GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -1843,12 +1843,12 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; ; GFX10-LABEL: udiv_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1856,17 +1856,17 @@ define amdgpu_kernel void 
@udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -1884,8 +1884,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) { ; GFX8-LABEL: udivrem_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -1949,55 +1949,55 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 
v1, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s5 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 
vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2012,7 +2012,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2029,8 +2029,8 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2081,34 +2081,34 @@ 
define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) { ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s4, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s6, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -2118,32 +2118,32 @@ define 
amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0xffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2152,12 +2152,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; ; GFX10-LABEL: udiv_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, 
s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2165,17 +2165,17 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2274,10 +2274,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2319,14 +2319,14 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: 
s_sub_i32 s3, 0, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2387,34 +2387,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) { ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX8-NEXT: s_bfe_u32 s7, s6, 0x30008 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 
v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2426,32 +2426,32 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX9-LABEL: udivrem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 7 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2462,12 +2462,12 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; ; GFX10-LABEL: udivrem_i3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008 +; GFX10-NEXT: s_bfe_u32 s6, s0, 0x30008 ; GFX10-NEXT: s_and_b32 s0, s0, 7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2475,17 +2475,17 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2505,34 +2505,34 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % 
define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) { ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s0, 0, s5 -; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff +; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2544,32 +2544,32 @@ define amdgpu_kernel void @udivrem_i27(ptr 
addrspace(1) %out0, ptr addrspace(1) ; ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s1, 0, s4 -; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2580,12 +2580,12 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s1, 0, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2593,17 +2593,17 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 03dc0444018cac..6b8b451c391b42 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void 
@v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v5, 16 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,7 +28,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: global_store_short v1, v0, s[0:1] -; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[2:3] +; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[2:3] 
offset:2 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -61,21 +61,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v1, v2, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v1, v2, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,30 +97,30 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 
s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX906-NEXT: global_store_byte v4, v1, s[0:1] -; GFX906-NEXT: global_store_byte v4, v0, s[0:1] offset:1 -; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[0:1] offset:2 -; GFX906-NEXT: global_store_byte v4, v3, s[0:1] offset:3 -; GFX906-NEXT: global_store_byte v4, v2, s[0:1] offset:4 +; GFX906-NEXT: global_store_byte v4, v1, s[2:3] +; GFX906-NEXT: global_store_byte v4, v0, s[2:3] offset:1 +; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[2:3] offset:2 +; GFX906-NEXT: global_store_byte v4, v3, s[2:3] offset:3 +; GFX906-NEXT: global_store_byte v4, v2, s[2:3] offset:4 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -142,21 +142,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -178,21 +178,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -214,25 +214,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; 
GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7] ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16 ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[2:3] offset:16 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -254,24 +254,24 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 
0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32 @@ -288,16 +288,16 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte 
Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48 @@ -314,13 +314,13 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240 ; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Spill ; GFX906-NEXT: v_mov_b32_e32 v0, v57 ; GFX906-NEXT: v_mov_b32_e32 v1, v58 ; GFX906-NEXT: v_mov_b32_e32 v2, v59 @@ -377,34 +377,34 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_mov_b32_e32 v11, v7 ; GFX906-NEXT: v_mov_b32_e32 v10, v6 ; GFX906-NEXT: v_mov_b32_e32 v9, v5 -; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:16 -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:48 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:80 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:96 -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:128 -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:208 -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224 -; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[2:3] +; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[2:3] offset:16 +; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[2:3] offset:32 +; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], 
s[2:3] offset:48 +; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[2:3] offset:64 +; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[2:3] offset:80 +; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[2:3] offset:96 +; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[2:3] offset:112 +; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[2:3] offset:128 +; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[2:3] offset:144 +; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[2:3] offset:160 +; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[2:3] offset:176 +; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[2:3] offset:192 +; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[2:3] offset:208 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224 +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 +; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240 ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -427,26 +427,26 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX906-NEXT: 
s_load_dword s2, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_cmp_lt_i32 s0, 3 +; GFX906-NEXT: s_cmp_lt_i32 s2, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock -; GFX906-NEXT: s_cmp_ge_i32 s0, 1 +; GFX906-NEXT: s_cmp_ge_i32 s2, 1 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.2: ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[4:5] ; GFX906-NEXT: s_branch .LBB7_5 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 -; GFX906-NEXT: s_cmp_eq_u32 s0, 3 +; GFX906-NEXT: s_cmp_eq_u32 s2, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX906-NEXT: global_load_dword v0, v0, s[6:7] ; GFX906-NEXT: .LBB7_5: ; %return.sink.split -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -479,7 +479,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: 
s_waitcnt lgkmcnt(0) @@ -584,14 +584,14 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xff ; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v1, s[0:1] -; GFX906-NEXT: s_mov_b64 s[0:1], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 @@ -602,13 +602,13 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX906-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_cbranch_execnz .LBB10_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index 
ef2e57eafbf137..037210a496d6d6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i8_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX9-LABEL: constant_load_i8_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -29,7 +29,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a ; ; GFX10-LABEL: constant_load_i8_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 { ; GFX8-LABEL: constant_load_i16_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX9-LABEL: constant_load_i16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr ; ; GFX10-LABEL: constant_load_i16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -97,7 +97,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: sextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: sextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -152,7 +152,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: sextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i8_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -183,7 +183,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i8_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -195,7 +195,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i8_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -213,7 +213,7 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: zextload_i16_to_i32_align4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: zextload_i16_to_i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -238,7 +238,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: zextload_i16_to_i32_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -269,7 +269,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: constant_load_i8_align2: 
; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -279,7 +279,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: constant_load_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -294,7 +294,7 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_load_i16_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -307,7 +307,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: constant_load_i16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -317,7 +317,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: constant_load_i16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -332,7 +332,7 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) 
%out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_sextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -351,7 +351,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_sextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: constant_sextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3] @@ -379,7 +379,7 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX8-LABEL: constant_zextload_i8_align2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -398,7 +398,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: constant_zextload_i8_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -409,7 +409,7 @@ define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, pt 
; ; GFX10-LABEL: constant_zextload_i8_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index e9797fa1fc309f..422e2747094ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: s_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: s_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: s_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: s_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, 
ptr addrspace(1) %in ; ; GFX11-LABEL: s_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -73,7 +73,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: s_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -95,7 +95,7 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -153,7 +153,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -169,7 +169,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -193,7 +193,7 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: s_add_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: s_add_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -231,7 +231,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_add_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -249,7 +249,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_add_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -267,7 +267,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_add_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: s_add_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -313,36 +313,36 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { ; GFX6-LABEL: s_add_v8i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s15 -; GFX6-NEXT: s_add_i32 s1, s6, s14 -; GFX6-NEXT: s_add_i32 s2, s5, s13 -; GFX6-NEXT: s_add_i32 s3, s4, s12 -; GFX6-NEXT: s_add_i32 s4, s11, s19 -; GFX6-NEXT: s_add_i32 s5, s10, 
s18 -; GFX6-NEXT: s_add_i32 s6, s9, s17 -; GFX6-NEXT: s_add_i32 s7, s8, s16 -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; GFX6-NEXT: s_add_i32 s11, s11, s19 +; GFX6-NEXT: s_add_i32 s10, s10, s18 +; GFX6-NEXT: s_add_i32 s9, s9, s17 +; GFX6-NEXT: s_add_i32 s8, s8, s16 +; GFX6-NEXT: s_add_i32 s7, s7, s15 +; GFX6-NEXT: s_add_i32 s6, s6, s14 +; GFX6-NEXT: s_add_i32 s5, s5, s13 +; GFX6-NEXT: s_add_i32 s4, s4, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v8i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s15 ; GFX8-NEXT: s_add_i32 s6, s6, s14 @@ -372,9 +372,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX9-LABEL: s_add_v8i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s15 ; GFX9-NEXT: s_add_i32 s3, s6, s14 @@ -399,10 +399,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX10-LABEL: s_add_v8i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s15 ; GFX10-NEXT: s_add_i32 s3, s6, s14 @@ -427,8 +426,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX11-LABEL: s_add_v8i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s15 ; GFX11-NEXT: s_add_i32 s3, s6, s14 @@ -453,8 +452,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX12-LABEL: s_add_v8i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s15 ; GFX12-NEXT: s_add_co_i32 s3, s6, s14 @@ -484,58 +483,58 @@ entry: define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) { ; GFX6-LABEL: s_add_v16i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; GFX6-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29 -; GFX6-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NEXT: s_mov_b32 
s22, -1 +; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; GFX6-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x29 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s7, s39 -; GFX6-NEXT: s_add_i32 s1, s6, s38 -; GFX6-NEXT: s_add_i32 s2, s5, s37 -; GFX6-NEXT: s_add_i32 s3, s4, s36 -; GFX6-NEXT: s_add_i32 s4, s11, s43 -; GFX6-NEXT: s_add_i32 s5, s10, s42 -; GFX6-NEXT: s_add_i32 s6, s9, s41 -; GFX6-NEXT: s_add_i32 s7, s8, s40 -; GFX6-NEXT: s_add_i32 s8, s15, s47 -; GFX6-NEXT: s_add_i32 s9, s14, s46 -; GFX6-NEXT: s_add_i32 s10, s13, s45 -; GFX6-NEXT: s_add_i32 s11, s12, s44 -; GFX6-NEXT: s_add_i32 s12, s19, s51 -; GFX6-NEXT: s_add_i32 s13, s18, s50 -; GFX6-NEXT: s_add_i32 s14, s17, s49 -; GFX6-NEXT: s_add_i32 s15, s16, s48 -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 +; GFX6-NEXT: s_add_i32 s19, s19, s51 +; GFX6-NEXT: s_add_i32 s18, s18, s50 +; GFX6-NEXT: s_add_i32 s17, s17, s49 +; GFX6-NEXT: s_add_i32 s16, s16, s48 +; GFX6-NEXT: s_add_i32 s15, s15, s47 +; GFX6-NEXT: s_add_i32 s14, s14, s46 +; GFX6-NEXT: s_add_i32 s13, s13, s45 +; GFX6-NEXT: s_add_i32 s12, s12, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_add_i32 s11, s11, s43 +; GFX6-NEXT: s_add_i32 s10, s10, s42 +; GFX6-NEXT: s_add_i32 s9, s9, s41 +; GFX6-NEXT: s_add_i32 s8, s8, s40 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_add_i32 s7, s7, s39 +; GFX6-NEXT: s_add_i32 s6, s6, s38 +; GFX6-NEXT: s_add_i32 s5, s5, s37 +; GFX6-NEXT: s_add_i32 s4, s4, s36 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_add_v16i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX8-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s39 ; GFX8-NEXT: s_add_i32 s6, s6, s38 @@ -591,10 +590,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX9-LABEL: s_add_v16i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s7, s39 ; GFX9-NEXT: s_add_i32 s3, s6, s38 @@ -638,11 +637,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; ; GFX10-LABEL: s_add_v16i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s2, s7, s39 ; GFX10-NEXT: s_add_i32 s3, s6, s38 @@ -685,9 +684,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX11-LABEL: s_add_v16i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s39 ; GFX11-NEXT: s_add_i32 s3, s6, s38 @@ -726,9 +725,9 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX12-LABEL: s_add_v16i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0xa4 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX12-NEXT: s_load_b512 
s[36:51], s[0:1], 0xa4 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s39 ; GFX12-NEXT: s_add_co_i32 s3, s6, s38 @@ -772,7 +771,7 @@ entry: define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -793,7 +792,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX8-LABEL: v_add_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -813,7 +812,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_add_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -827,7 +826,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_add_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -841,11 +840,9 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_add_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -859,11 +856,9 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_add_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -887,7 +882,7 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_add_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -906,7 +901,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_add_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -922,7 +917,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_add_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -934,7 +929,7 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) 
%out, ptr addrspace(1) ; ; GFX10-LABEL: v_add_imm_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -946,10 +941,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_add_imm_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -961,10 +954,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_add_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -985,8 +976,8 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: add64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ 
-1001,8 +992,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX8-LABEL: add64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s0, s6, s0 @@ -1015,12 +1006,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: add64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1029,12 +1020,12 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX10-LABEL: add64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s6, s0 -; GFX10-NEXT: s_addc_u32 s1, s7, s1 +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -1043,8 +1034,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: add64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 ; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -1058,8 +1049,8 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-LABEL: add64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1081,8 +1072,8 @@ entry: define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) { ; GFX6-LABEL: add64_sgpr_vgpr: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1099,8 +1090,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX8-LABEL: add64_sgpr_vgpr: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1115,11 +1106,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; ; GFX9-LABEL: add64_sgpr_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -1131,11 +1122,11 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX10-LABEL: add64_sgpr_vgpr: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s6, s0 ; GFX10-NEXT: s_addc_u32 s1, s7, s1 @@ -1147,8 +1138,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX11-LABEL: add64_sgpr_vgpr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1164,8 +1155,8 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-LABEL: add64_sgpr_vgpr: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 
; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1187,7 +1178,7 @@ entry: define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; GFX6-LABEL: add64_in_branch: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -1214,7 +1205,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add64_in_branch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1240,7 +1231,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -1265,7 +1256,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1288,7 +1279,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: add64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 @@ -1312,7 +1303,7 @@ define amdgpu_kernel void @add64_in_branch(ptr 
addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index b751be51a97393..6f67ce4de9ce54 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -33,13 +33,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 @@ -65,12 +65,10 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -95,8 +93,8 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; VI-LABEL: s_test_add_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -116,37 +114,37 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_test_add_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 +; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; VI-LABEL: s_test_add_self_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -186,7 +184,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: s_test_add_self_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_add_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -208,7 +206,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_add_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -245,7 +243,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -255,7 +253,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 @@ -264,7 +262,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x1c8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -300,7 +298,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_add_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -313,7 +311,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_add_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) @@ -325,10 +323,8 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_add_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -350,7 +346,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,7 +365,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -382,7 +378,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_add_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -394,10 +390,8 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_add_v2i16_neg_constant: ; GFX11: 
; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -418,7 +412,7 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -437,7 +431,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_add_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -449,7 +443,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_add_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +455,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_add_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +477,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,7 +495,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -515,7 +507,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -527,10 +519,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -552,7 +542,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; VI-LABEL: v_test_add_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f80 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -582,7 +572,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -594,10 +584,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_add_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) 
@@ -619,8 +607,8 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -642,14 +630,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -660,13 +648,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: 
global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -678,12 +666,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -713,8 +699,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -738,14 +724,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v2, v3 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 @@ -757,13 +743,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -776,10 +762,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -811,8 +795,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -836,14 +820,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 @@ -854,13 +838,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -872,12 +856,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -907,8 +889,8 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -933,13 +915,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -953,14 +935,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -974,12 +956,10 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 1315d576a83eb6..526d5c946ec7f6 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefix=HSA -check-prefix=CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 87d9a52118cefc..e44d73b2cecd0b 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -244,7 +244,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 @@ -302,8 +302,7 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: no_agpr_no_reserve: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 @@ -515,14 +514,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX908-NEXT: s_load_dword s9, s[6:7], 0x18 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 ; 
GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s7, s8 +; GFX908-NEXT: s_mov_b32 s5, s8 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s6, 0, s3 +; GFX908-NEXT: s_sub_i32 s4, 0, s3 ; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 ; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -531,32 +530,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s6, s6, s10 -; GFX908-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX908-NEXT: s_add_i32 s10, s10, s6 -; GFX908-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s6, s3 +; GFX908-NEXT: s_mul_i32 s4, s4, s10 +; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX908-NEXT: s_add_i32 s10, s10, s4 +; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s4, s3 ; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s6, 1 +; GFX908-NEXT: s_add_i32 s11, s4, 1 ; GFX908-NEXT: s_sub_i32 s10, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s11, s6 +; GFX908-NEXT: s_cselect_b32 s4, s11, s4 ; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s6, 1 +; GFX908-NEXT: s_add_i32 s10, s4, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s6, s10, s6 +; GFX908-NEXT: s_cselect_b32 s4, s10, s4 ; GFX908-NEXT: s_lshr_b32 s9, s9, 16 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v16 -; GFX908-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX908-NEXT: s_mul_i32 s1, s1, s7 -; 
GFX908-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX908-NEXT: s_mul_i32 s0, s0, s7 +; GFX908-NEXT: v_readfirstlane_b32 s5, v16 +; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX908-NEXT: s_mul_i32 s1, s1, s5 +; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX908-NEXT: s_mul_i32 s0, s0, s5 ; GFX908-NEXT: s_add_i32 s1, s9, s1 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 @@ -572,7 +571,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX908-NEXT: s_mov_b32 s9, s8 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 @@ -582,20 +581,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_mov_b32_e32 v5, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s7, v2 +; GFX908-NEXT: v_readfirstlane_b32 s5, v2 ; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s7, s7, 1 +; GFX908-NEXT: s_add_u32 s5, s5, 1 ; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7 +; GFX908-NEXT: s_mul_hi_u32 s20, s2, s5 ; GFX908-NEXT: s_mul_i32 s9, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s7 +; GFX908-NEXT: s_mul_i32 s21, s3, s5 ; GFX908-NEXT: s_add_i32 s9, s20, s9 -; GFX908-NEXT: s_mul_i32 s7, s2, s7 +; GFX908-NEXT: s_mul_i32 s5, s2, s5 ; GFX908-NEXT: s_add_i32 s9, s9, s21 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 @@ -611,7 +610,7 @@ define amdgpu_kernel 
void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s7 +; GFX908-NEXT: s_add_u32 s20, s18, s5 ; GFX908-NEXT: s_addc_u32 s21, s19, s9 ; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -671,8 +670,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s4, s4, s6 -; GFX908-NEXT: s_addc_u32 s5, s5, 0 +; GFX908-NEXT: s_add_u32 s6, s6, s4 +; GFX908-NEXT: s_addc_u32 s7, s7, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 ; GFX908-NEXT: s_mov_b64 s[0:1], 0 @@ -683,14 +682,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[6:7], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 ; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: s_mov_b32 s5, s8 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s6, 0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0 @@ -698,32 +697,32 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s9 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v3 
-; GFX90A-NEXT: s_mul_i32 s6, s6, s10 -; GFX90A-NEXT: s_mul_hi_u32 s6, s10, s6 -; GFX90A-NEXT: s_add_i32 s10, s10, s6 -; GFX90A-NEXT: s_mul_hi_u32 s6, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s6, s3 +; GFX90A-NEXT: s_mul_i32 s4, s4, s10 +; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX90A-NEXT: s_add_i32 s10, s10, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s4, s3 ; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s6, 1 +; GFX90A-NEXT: s_add_i32 s11, s4, 1 ; GFX90A-NEXT: s_sub_i32 s10, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s11, s6 +; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 ; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s6, 1 +; GFX90A-NEXT: s_add_i32 s10, s4, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s6, s10, s6 +; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 ; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s9 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[4:5], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v18 -; GFX90A-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX90A-NEXT: s_mul_i32 s1, s1, s7 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX90A-NEXT: s_mul_i32 s0, s0, s7 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 +; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX90A-NEXT: s_mul_i32 s1, s1, s5 +; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX90A-NEXT: s_mul_i32 s0, s0, s5 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 @@ -739,26 +738,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: 
global_load_dwordx2 v[4:5], v[0:1], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s7, s7, 1 +; GFX90A-NEXT: s_add_u32 s5, s5, 1 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s7 +; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s5 ; GFX90A-NEXT: s_mul_i32 s9, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s7 +; GFX90A-NEXT: s_mul_i32 s21, s3, s5 ; GFX90A-NEXT: s_add_i32 s9, s20, s9 -; GFX90A-NEXT: s_mul_i32 s7, s2, s7 +; GFX90A-NEXT: s_mul_i32 s5, s2, s5 ; GFX90A-NEXT: s_add_i32 s9, s9, s21 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 @@ -774,7 +773,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s7 +; GFX90A-NEXT: s_add_u32 s20, s18, s5 ; GFX90A-NEXT: s_addc_u32 s21, s19, s9 ; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -827,8 +826,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in 
Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s4, s4, s6 -; GFX90A-NEXT: s_addc_u32 s5, s5, 0 +; GFX90A-NEXT: s_add_u32 s6, s6, s4 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index 8d87b53efb4e73..bd5dc6e2070986 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -175,4 +175,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 0a461f9ee6c968..0c5e1ec0d5b6f1 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) { ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll index def6df9adf5977..330cf48803680d 100644 --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -41,16 +41,16 @@ define void @test1() { define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; GFX9-LABEL: test2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX9-NEXT: s_cmp_lt_i32 s0, 1 +; GFX9-NEXT: s_cmp_lt_i32 s2, 1 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB2_2: ; %then @@ -58,16 +58,16 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX10-LABEL: test2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lt_i32 s0, 1 +; GFX10-NEXT: s_cmp_lt_i32 s2, 1 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then @@ -75,15 +75,15 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) { ; ; GFX11-LABEL: test2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 
:: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB2_2: ; %then diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index bf72cccd912cee..cb59121d697083 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -392,7 +392,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x83 ; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 7cf18171a6cd74..8144fb7a3b6461 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -137,7 +137,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -280,7 +280,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s4, s3 @@ -359,7 +359,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +394,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_abs_i32 s3, s3 @@ -452,15 +452,15 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: udiv_i16: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -468,20 +468,19 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -489,7 +488,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: 
v_addc_co_u32_e32 v0, vcc, 0, v4, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i16 %x, %y @@ -523,37 +521,36 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: urem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_and_b32 s0, s4, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s2, s4, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s3, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -562,8 +559,8 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -600,8 +597,8 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: sdiv_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -626,27 +623,27 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: sdiv_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s0, s4, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: 
s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, ptr addrspace(1) %out @@ -683,8 +680,8 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX6-LABEL: srem_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s5, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -711,8 +708,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -722,6 +718,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -733,6 +730,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: v_mul_lo_u32 v0, 
v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i16 %x, %y @@ -764,8 +762,8 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: udiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -783,13 +781,13 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: udiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -829,8 +827,8 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: urem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 @@ -851,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, 8 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -865,8 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem i8 %x, %y @@ -902,8 +901,8 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: sdiv_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -928,27 +927,27 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: 
s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -985,8 +984,8 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX6-LABEL: srem_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1014,30 +1013,30 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i8 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 
-; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i8 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, ptr addrspace(1) %out @@ -1179,13 +1178,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_sub_i32 s0, 0, s12 +; GFX6-NEXT: s_sub_i32 s2, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -1195,28 +1194,28 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; 
GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s12 -; GFX6-NEXT: s_sub_i32 s0, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s12 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s12 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s12 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 @@ -1277,9 +1276,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ 
-1499,36 +1498,34 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ 
-1536,58 +1533,60 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, 
s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1843,34 +1842,34 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 ; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_xor_b32 s4, s8, s12 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s8 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: s_mul_i32 s4, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_abs_i32 s4, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: s_sub_i32 s5, 0, s4 @@ -1878,7 +1877,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_xor_b32 s6, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: 
v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 @@ -1965,16 +1964,17 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s1, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_xor_b32 s3, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s1, 31 +; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 @@ -1982,82 +1982,81 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s12, s8, s0 +; GFX9-NEXT: s_mul_i32 s12, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s12 ; GFX9-NEXT: s_add_i32 s13, s8, 1 -; GFX9-NEXT: s_sub_i32 s12, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s12, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s8, s13, s8 ; GFX9-NEXT: s_cselect_b32 s4, s12, s4 ; GFX9-NEXT: s_add_i32 s12, s8, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s12, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s12, s8 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_xor_b32 s8, s5, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: 
s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: s_ashr_i32 s8, s8, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s1, s0, s4 -; GFX9-NEXT: s_sub_i32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s4 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_add_i32 s5, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_xor_b32 s4, s6, s10 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s9, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s12, s3, 1 +; GFX9-NEXT: s_sub_i32 s9, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s12, s3 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s9, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: s_sub_i32 s8, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_xor_b32 s5, s6, s10 +; GFX9-NEXT: s_abs_i32 s6, s6 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: 
v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s6, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s9, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 +; GFX9-NEXT: s_add_i32 s10, s8, 1 +; GFX9-NEXT: s_sub_i32 s9, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s8 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_xor_b32 s2, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2077,7 +2076,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2244,34 +2242,35 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_abs_i32 s0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_abs_i32 s2, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: s_mul_i32 s8, s8, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s8, s1 -; GFX6-NEXT: s_sub_i32 s8, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s0, s8, s1 -; GFX6-NEXT: s_abs_i32 s1, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s8, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s0, s4 -; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_mul_i32 s8, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s8 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s8, s3 +; GFX6-NEXT: s_sub_i32 s8, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s2, s8, s3 +; GFX6-NEXT: s_abs_i32 s3, 
s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2281,22 +2280,21 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 -; GFX6-NEXT: s_sub_i32 s8, s4, s1 -; GFX6-NEXT: s_cmp_ge_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s1, s8, s4 -; GFX6-NEXT: s_abs_i32 s4, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX6-NEXT: s_sub_i32 s8, 0, s4 -; GFX6-NEXT: s_xor_b32 s1, s1, s5 -; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_abs_i32 s3, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s8, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s5 +; GFX6-NEXT: s_sub_i32 s5, s2, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 @@ -2305,59 +2303,59 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v0 -; GFX6-NEXT: s_mul_i32 s5, s5, s4 -; GFX6-NEXT: 
s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s5, s8, s5 -; GFX6-NEXT: s_sub_i32 s8, s5, s4 -; GFX6-NEXT: s_cmp_ge_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s8, s5 -; GFX6-NEXT: s_abs_i32 s5, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_sub_i32 s8, 0, s5 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s8, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s8, s8, s2 +; GFX6-NEXT: s_abs_i32 s9, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_abs_i32 s0, s7 -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v2 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_xor_b32 s2, s4, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_abs_i32 s4, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_ashr_i32 s1, s7, 31 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s7, 31 +; GFX6-NEXT: s_xor_b32 s7, s8, s6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v2 -; GFX6-NEXT: s_mul_i32 s3, s3, s5 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_sub_i32 s3, s0, s5 -; GFX6-NEXT: s_cmp_ge_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s0, s3, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 -; GFX6-NEXT: 
s_sub_i32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX6-NEXT: s_sub_i32 s6, s7, s6 +; GFX6-NEXT: v_readfirstlane_b32 s7, v2 +; GFX6-NEXT: s_mul_i32 s7, s7, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s9 +; GFX6-NEXT: s_cmp_ge_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: s_ashr_i32 s1, s4, 31 +; GFX9-NEXT: s_abs_i32 s2, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_ashr_i32 s3, s4, 31 ; GFX9-NEXT: s_abs_i32 s4, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2367,73 +2365,72 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 ; GFX9-NEXT: s_add_i32 s12, s12, s8 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 -; GFX9-NEXT: s_mul_i32 s8, s8, s0 +; GFX9-NEXT: s_mul_i32 s8, s8, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s4, s0 -; 
GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s0, s8, s4 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s8, s4 ; GFX9-NEXT: s_abs_i32 s4, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_sub_i32 s9, 0, s4 -; GFX9-NEXT: s_sub_i32 s12, s0, s1 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s9, s9, s0 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s9 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s0, s4 -; GFX9-NEXT: s_cmp_ge_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s0, s1, s0 -; GFX9-NEXT: s_abs_i32 s1, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: s_ashr_i32 s4, s6, 31 -; GFX9-NEXT: s_abs_i32 s5, s6 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s3 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9 +; GFX9-NEXT: s_add_i32 s3, s3, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_abs_i32 s4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s8 +; GFX9-NEXT: s_sub_i32 s9, 0, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s6, 0, s1 -; GFX9-NEXT: 
s_sub_i32 s8, s0, s8 +; GFX9-NEXT: s_ashr_i32 s5, s6, 31 +; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_i32 s5, s0, s1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s5, s5, s0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8 +; GFX9-NEXT: s_mul_i32 s8, s8, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s6, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s6 ; GFX9-NEXT: s_abs_i32 s6, s11 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s2, s7, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_abs_i32 s3, s7 ; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 ; GFX9-NEXT: s_mul_i32 s7, s7, s5 ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 @@ -2451,7 +2448,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i32> %x, %y @@ -2546,8 +2542,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2606,21 +2602,21 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2658,7 +2654,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y @@ -2761,8 +2756,8 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2829,34 +2824,35 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_addc_co_u32_e32 
v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -2871,21 +2867,20 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 @@ -2999,8 +2994,8 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3079,79 +3074,79 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: s_or_b32 s4, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: s_ashr_i32 s3, s7, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 -; GFX9-NEXT: s_ashr_i32 s2, s5, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX9-NEXT: s_ashr_i32 s0, s5, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 
s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3269,8 +3264,8 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3361,78 +3356,78 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX9-NEXT: 
s_xor_b32 s2, s9, s8 +; GFX9-NEXT: s_xor_b32 s0, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v1, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s6 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s4, s6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_sext_i32_i16 s6, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s2, s6, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s10, s2, 1 +; 
GFX9-NEXT: s_xor_b32 s0, s6, s8 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s10, 0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s2, s5, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_xor_b32 s0, s5, s7 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_or_b32 s8, s2, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 +; GFX9-NEXT: s_or_b32 s8, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 @@ -3441,7 +3436,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, 
v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3472,8 +3467,8 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: udiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 @@ -3494,15 +3489,15 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: udiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s2, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 @@ -3510,7 +3505,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v2, v0, s[0:1] +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3543,8 +3538,8 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: urem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -3568,24 +3563,24 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GFX9-NEXT: s_and_b32 s1, s4, 7 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_and_b32 s4, s2, 7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] @@ -3623,8 +3618,8 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: sdiv_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 
; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3650,28 +3645,28 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: sdiv_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 -; GFX9-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, ptr addrspace(1) %out @@ -3708,8 +3703,8 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX6-LABEL: srem_i3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, 
s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -3738,27 +3733,27 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX9-NEXT: s_xor_b32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s5, s4, 8 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 @@ -3837,8 +3832,8 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3884,21 +3879,21 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -3923,7 +3918,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4005,8 +3999,8 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4058,33 +4052,33 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s6, 0xffff +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_and_b32 s8, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX9-NEXT: s_and_b32 s3, s5, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: 
v_cvt_f32_u32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4093,16 +4087,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4190,8 +4185,8 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4252,62 +4247,62 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, 
s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX9-NEXT: s_xor_b32 s2, s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s0, s6 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9-NEXT: s_sext_i32_i16 s1, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s8, s2, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s8, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s8, 0 -; GFX9-NEXT: s_ashr_i32 s3, s6, 16 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s8, 0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: v_add_u32_e32 v2, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s2, s4, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s4, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_sext_i32_i16 s3, s7 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 -; GFX9-NEXT: 
s_sext_i32_i16 s2, s5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX9-NEXT: s_sext_i32_i16 s0, s5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s4, s2, 1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[0:1] +; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4399,8 +4394,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -4469,8 +4464,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 @@ -4522,6 +4516,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 @@ -4529,6 +4524,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4604,31 +4600,33 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s10, 0x7fff -; GFX6-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; 
GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX6-NEXT: s_bfe_u32 s3, s10, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4651,33 +4649,31 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; 
GFX9-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -4791,41 +4787,41 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s3, s10, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: 
v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4834,32 +4830,32 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s2, s10, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v1 +; GFX6-NEXT: 
s_mov_b32 s0, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 @@ -5000,50 +4996,52 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: 
v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v4 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: 
v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -5061,46 +5059,43 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_bfe_i32 s2, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; 
GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v5 -; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -5110,6 +5105,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 @@ -5227,73 +5223,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX6-NEXT: s_xor_b32 s1, s2, s1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_mov_b32 s4, s8 -; GFX6-NEXT: s_mov_b32 s5, s9 -; GFX6-NEXT: s_lshr_b32 s8, s10, 15 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: s_lshr_b32 s9, s0, 15 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX6-NEXT: s_cselect_b32 s1, s1, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v6 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX6-NEXT: s_bfe_i32 s1, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_lshr_b32 s7, s6, 15 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_lshr_b32 s9, s8, 15 +; GFX6-NEXT: s_or_b32 s10, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 +; GFX6-NEXT: 
v_mul_lo_u32 v4, v4, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: s_or_b32 s2, s0, 1 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s0, s2, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s0, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 
@@ -5301,54 +5297,54 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s2, s6, 0xf0000 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: s_lshr_b32 s7, s0, 15 -; GFX9-NEXT: s_or_b32 s1, s1, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 -; GFX9-NEXT: s_bfe_i32 s1, 
s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 +; GFX9-NEXT: s_or_b32 s7, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5367,7 +5363,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -5397,8 +5393,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -5413,17 +5409,17 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -5438,8 +5434,8 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: udiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5450,13 +5446,13 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: udiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: s_lshr_b32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5472,7 +5468,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,7 +5482,7 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s3, s3, 12 @@ -5513,7 +5509,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5528,7 +5524,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 12 @@ -5555,7 +5551,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -5574,7 +5570,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 @@ -5664,42 +5660,42 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; 
GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX6-NEXT: s_sub_i32 s4, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 @@ -5720,54 +5716,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, 
s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s3, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; GFX9-NEXT: s_sub_i32 s4, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s4, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s6, s3, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s7 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s3, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s7, s6 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s6, s4, s2 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s7, s4, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; 
GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s2, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -5784,10 +5780,10 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -5803,19 +5799,19 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881 -; GFX9-NEXT: s_sub_i32 s3, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 20 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX9-NEXT: s_sub_i32 s1, s4, s0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 20 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 1235195 store 
i32 %r, ptr addrspace(1) %out @@ -5830,8 +5826,8 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: urem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5842,13 +5838,13 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: urem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -5864,7 +5860,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5879,7 +5875,7 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -5907,7 +5903,7 @@ define amdgpu_kernel void 
@urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5922,7 +5918,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -6004,35 +6000,35 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: 
s_sub_i32 s4, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s4, s4, s1 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s4, s4, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -6049,56 +6045,55 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cselect_b32 s5, s7, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s2, s6 -; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 -; GFX9-NEXT: s_mul_i32 s3, s3, s8 -; GFX9-NEXT: s_mul_hi_u32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s8 -; GFX9-NEXT: s_mul_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s4 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s4, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6115,8 +6110,8 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -6131,17 +6126,17 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6156,8 +6151,8 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: sdiv_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6171,16 +6166,16 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: sdiv_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s4, s4, s2 -; GFX9-NEXT: s_ashr_i32 s2, s4, 12 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s4, s4, s0 +; GFX9-NEXT: s_ashr_i32 s0, s4, 12 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6196,7 +6191,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6239,7 +6234,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6294,7 +6289,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6315,7 +6310,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6348,7 +6343,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -6370,7 +6365,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6481,50 +6476,50 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s1, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX6-NEXT: s_sub_i32 s6, 0, s1 -; GFX6-NEXT: s_xor_b32 s0, s4, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s4, s2 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX6-NEXT: s_abs_i32 s6, s4 -; GFX6-NEXT: s_ashr_i32 s4, s0, 31 +; GFX6-NEXT: s_ashr_i32 s4, s2, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s6, s2, s3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 +; GFX6-NEXT: s_cselect_b32 s2, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s0, s1 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: s_abs_i32 s6, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_xor_b32 s7, s5, s7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_abs_i32 s5, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s7, s7, 31 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -6544,73 +6539,71 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x 
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_abs_i32 s1, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_xor_b32 s2, s4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s0, 31 -; GFX9-NEXT: s_sub_i32 s0, 0, s1 +; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s8, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s7, s8 -; GFX9-NEXT: s_mul_i32 s8, s0, s1 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8 +; GFX9-NEXT: s_mul_i32 s8, s4, s3 ; GFX9-NEXT: s_sub_i32 s7, s7, s8 -; GFX9-NEXT: s_add_i32 s9, s0, 1 -; GFX9-NEXT: s_sub_i32 s8, s7, s1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 s0, s9, s0 +; GFX9-NEXT: s_add_i32 s9, s4, 1 +; GFX9-NEXT: s_sub_i32 s8, s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 ; GFX9-NEXT: s_cselect_b32 s7, s8, s7 -; GFX9-NEXT: s_add_i32 s8, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s1 -; GFX9-NEXT: s_cselect_b32 
s7, s8, s0 -; GFX9-NEXT: s_abs_i32 s8, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_xor_b32 s2, s5, s6 -; GFX9-NEXT: s_abs_i32 s3, s5 +; GFX9-NEXT: s_add_i32 s8, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s4 +; GFX9-NEXT: s_abs_i32 s4, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s7, s4 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_abs_i32 s5, s5 +; GFX9-NEXT: s_ashr_i32 s6, s6, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 31 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s3 +; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7 +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s7, s3, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s8, s3, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s6, s5 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; 
GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6627,9 +6620,9 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6647,19 +6640,19 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s2, s4, 0xd9528441 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 31 -; GFX9-NEXT: s_ashr_i32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s0, 31 +; GFX9-NEXT: s_ashr_i32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, ptr addrspace(1) %out @@ -6674,8 
+6667,8 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX6-LABEL: srem_i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6690,17 +6683,17 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { ; ; GFX9-LABEL: srem_i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s4, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_i32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s4, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s0, s4, s0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, ptr addrspace(1) %out @@ -6716,7 +6709,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6753,7 +6746,7 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 @@ -6805,7 +6798,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6828,7 +6821,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 @@ -6934,44 +6927,44 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX6-NEXT: s_abs_i32 s0, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX6-NEXT: s_abs_i32 s1, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_abs_i32 s3, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 
v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s1, s7 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s1, s7, s1 -; GFX6-NEXT: s_sub_i32 s7, s1, s0 -; GFX6-NEXT: s_cmp_ge_u32 s1, s0 -; GFX6-NEXT: s_cselect_b32 s7, s7, s1 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s7, s3 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s7, s7, s3 ; GFX6-NEXT: s_abs_i32 s6, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: s_abs_i32 s8, s5 ; GFX6-NEXT: s_xor_b32 s7, s7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -6989,20 +6982,20 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; 
GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s4, s4 @@ -7013,43 +7006,41 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s0 +; GFX9-NEXT: s_mul_i32 s7, s7, s2 ; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 -; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_sub_i32 s7, s4, s0 -; GFX9-NEXT: s_cmp_ge_u32 s4, s0 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 -; GFX9-NEXT: s_abs_i32 s7, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_sub_i32 s7, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s7, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_abs_i32 s3, s5 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_ashr_i32 s4, s5, 31 +; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s6 -; GFX9-NEXT: 
s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s5, s3, s7 -; GFX9-NEXT: s_cmp_ge_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 +; GFX9-NEXT: s_mul_i32 s6, s6, s3 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s3 +; GFX9-NEXT: s_cmp_ge_u32 s5, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -7078,7 +7069,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX6-NEXT: s_addc_u32 s5, s5, 0 ; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -7163,11 +7154,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 
; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7276,7 +7267,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: udiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7290,7 +7281,7 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: udiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 @@ -7312,8 +7303,8 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: udiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7328,12 +7319,12 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: udiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, 12 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX9-NEXT: s_add_i32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -7357,8 +7348,8 @@ define amdgpu_kernel void 
@udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: udiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7373,17 +7364,17 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: udiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7403,8 +7394,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7490,13 +7481,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -7579,9 +7570,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, < ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7604,27 +7595,27 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s0, s8, 12 -; GFX6-NEXT: s_add_i32 s2, s10, 12 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[12:15], 0 +; GFX6-NEXT: s_add_i32 s8, s8, 12 +; GFX6-NEXT: s_add_i32 s9, s10, 12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_i32 s2, s8, 12 ; GFX9-NEXT: s_add_i32 s8, s10, 12 @@ -7650,12 +7641,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, 4, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_addc_u32 s1, 0, 0 ; GFX6-NEXT: s_or_b32 s0, vcc_lo, vcc_hi -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 ; GFX6-NEXT: s_cmp_lg_u32 s0, 0 ; GFX6-NEXT: s_mov_b32 s0, 0x689e0837 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 @@ -7746,11 +7737,11 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_oddk_denom: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 s0, 4, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0fc ; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: s_addc_u32 s0, s1, 0 @@ -7857,7 +7848,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: urem_i64_pow2k_denom: ; 
GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 @@ -7871,7 +7862,7 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: urem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff @@ -7892,8 +7883,8 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: urem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s8, s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7910,11 +7901,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: urem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s2 ; GFX9-NEXT: s_add_u32 s0, s0, -1 ; GFX9-NEXT: s_addc_u32 s1, s1, -1 ; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] @@ -7941,8 +7932,8 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: urem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -7957,16 +7948,16 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: urem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0xfff -; GFX9-NEXT: s_and_b32 s3, s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -7989,31 +7980,31 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s10 -; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 -; GFX6-NEXT: s_add_u32 s2, s2, -1 -; GFX6-NEXT: s_addc_u32 s3, s3, -1 -; GFX6-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX6-NEXT: s_add_u32 s0, s0, -1 -; GFX6-NEXT: s_addc_u32 s1, s1, -1 -; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: 
v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 +; GFX6-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 +; GFX6-NEXT: s_add_u32 s8, s8, -1 +; GFX6-NEXT: s_addc_u32 s9, s9, -1 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_add_u32 s8, s10, -1 +; GFX6-NEXT: s_addc_u32 s9, s11, -1 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10 ; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8 @@ -8043,7 +8034,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: sdiv_i64_oddk_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s1, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -8159,7 +8150,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -8243,7 +8234,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: 
sdiv_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8261,7 +8252,7 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: sdiv_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8287,21 +8278,21 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: s_sub_u32 s4, 0, s10 ; GFX6-NEXT: s_subb_u32 s5, 0, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8423,19 +8414,19 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_u32 s0, 0, s10 -; GFX9-NEXT: s_subb_u32 s1, 0, s11 +; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s5, s5, s2 +; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_sub_u32 s0, 0, s8 +; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -8445,60 +8436,61 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s3 -; GFX9-NEXT: s_mul_i32 s13, s1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 +; GFX9-NEXT: s_mul_i32 s12, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 +; GFX9-NEXT: s_mul_i32 s13, s1, s11 ; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s3 +; GFX9-NEXT: s_mul_i32 s15, s0, s11 ; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s3, s12 -; GFX9-NEXT: s_mul_i32 s3, s3, s12 -; GFX9-NEXT: s_add_u32 s3, s14, s3 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 
+; GFX9-NEXT: s_mul_i32 s11, s11, s12 +; GFX9-NEXT: s_add_u32 s11, s14, s11 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s2, s15 -; GFX9-NEXT: s_mul_i32 s15, s2, s15 -; GFX9-NEXT: s_add_u32 s3, s3, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 -; GFX9-NEXT: s_addc_u32 s3, s13, s16 +; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 +; GFX9-NEXT: s_mul_i32 s15, s10, s15 +; GFX9-NEXT: s_add_u32 s11, s11, s15 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 +; GFX9-NEXT: s_addc_u32 s11, s13, s16 ; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s2, s12 -; GFX9-NEXT: s_add_u32 s3, s3, s12 +; GFX9-NEXT: s_mul_i32 s12, s10, s12 +; GFX9-NEXT: s_add_u32 s11, s11, s12 ; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s12 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 +; GFX9-NEXT: s_mul_i32 s11, s0, s10 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s3, s13, s3 +; GFX9-NEXT: s_add_i32 s11, s13, s11 ; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s3, s3, s1 +; GFX9-NEXT: s_add_i32 s11, s11, s1 ; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s2, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 +; GFX9-NEXT: s_mul_i32 s14, s10, s0 +; GFX9-NEXT: s_mul_i32 s16, s12, s11 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s3 +; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 ; GFX9-NEXT: s_add_u32 s0, s0, s16 ; GFX9-NEXT: s_addc_u32 s12, 0, s15 ; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 ; GFX9-NEXT: s_addc_u32 s0, s12, s13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 +; GFX9-NEXT: s_mul_i32 s11, s10, 
s11 +; GFX9-NEXT: s_add_u32 s0, s0, s11 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s2, s1 -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: s_addc_u32 s12, s10, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_readfirstlane_b32 s13, v1 ; GFX9-NEXT: s_mul_i32 s1, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 @@ -8514,24 +8506,24 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_mul_i32 s12, s7, s12 ; GFX9-NEXT: s_add_u32 s12, s0, s12 ; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s0, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s11, s12 +; GFX9-NEXT: s_mul_i32 s1, s9, s12 ; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 +; GFX9-NEXT: s_mul_i32 s1, s8, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s6, s0, s11 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s10, v1 +; GFX9-NEXT: s_subb_u32 s6, s0, s9 +; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s6, s6, 0 -; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cmp_ge_u32 s6, s9 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX9-NEXT: s_cmp_eq_u32 s6, s11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s6, 
s9 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -8549,10 +8541,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s11 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -8562,7 +8554,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -8589,8 +8581,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: sdiv_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -8613,25 +8605,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: sdiv_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8651,8 +8643,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01 ; GFX6-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8752,17 +8744,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8 ; GFX9-NEXT: s_addc_u32 s5, 0, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 @@ -8846,11 +8838,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_sub_u32 s5, s6, s4 ; GFX9-NEXT: s_subb_u32 s4, s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -8873,36 +8865,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 -; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s16, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s16 -; GFX6-NEXT: s_mov_b32 s17, s16 -; GFX6-NEXT: s_addc_u32 s1, s1, s16 -; GFX6-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s0, 0, s12 -; GFX6-NEXT: s_subb_u32 s1, 0, s13 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 
s[12:13], 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s14, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s14 +; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: s_addc_u32 s3, s3, s14 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s10, 0, s2 +; GFX6-NEXT: s_subb_u32 s11, 0, s3 +; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s16 +; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s16 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8921,11 +8914,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8941,11 +8935,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -8955,28 +8946,29 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: 
v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8985,23 +8977,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[16:17] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: s_add_u32 s4, s14, s2 +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_u32 s12, s12, s4 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s15, s2 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GFX6-NEXT: s_mov_b32 s5, s4 +; GFX6-NEXT: s_addc_u32 s13, s13, s4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GFX6-NEXT: 
v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9010,16 +9002,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s4 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9038,11 +9030,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9057,14 +9049,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, 
v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9080,25 +9072,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s4, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 
s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9109,15 +9101,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -9130,19 +9122,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: 
s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 @@ -9416,6 +9408,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y @@ -9432,6 +9425,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: srem_i64_oddk_denom: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x33fe64 ; GFX6-NEXT: s_add_u32 s0, 0x396, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x28100000 @@ -9451,7 +9445,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v2 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc @@ -9546,7 +9539,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { ; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 ; GFX9-NEXT: s_add_u32 s6, s6, s10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 ; GFX9-NEXT: s_addc_u32 s6, s8, s9 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 @@ -9633,7 +9626,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX6-LABEL: 
srem_i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9653,7 +9646,7 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { ; ; GFX9-LABEL: srem_i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -9681,21 +9674,21 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s4 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s1, s1, s4 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX6-NEXT: s_addc_u32 s3, s3, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_u32 s4, 0, s8 ; GFX6-NEXT: s_subb_u32 s5, 0, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9815,17 +9808,17 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s2 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s8 ; GFX9-NEXT: s_subb_u32 s1, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 @@ -9979,8 +9972,8 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX6-LABEL: srem_v2i64_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10007,17 +10000,17 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; ; GFX9-LABEL: srem_v2i64_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 0xfffff000 -; GFX9-NEXT: s_sub_u32 s2, s4, s2 -; GFX9-NEXT: s_subb_u32 s3, s5, s3 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: 
s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s4, s0 +; GFX9-NEXT: s_subb_u32 s1, s5, s1 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 @@ -10025,11 +10018,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, ptr addrspace(1) %out @@ -10052,36 +10045,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s8, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s8 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s1, s1, s8 -; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s0, 0, s14 -; GFX6-NEXT: s_subb_u32 s1, 0, s15 +; GFX6-NEXT: s_sub_u32 s2, 0, s14 +; GFX6-NEXT: s_subb_u32 s3, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: 
v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX6-NEXT: s_addc_u32 s1, s5, s12 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10100,11 +10096,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -10120,11 +10116,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, 
s4, s12 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s5, s12 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -10305,19 +10298,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s8 ; GFX9-NEXT: s_lshl_b64 s[10:11], 0x1000, s10 -; GFX9-NEXT: s_ashr_i32 s8, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s1, s8 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_sub_u32 s0, 0, s12 ; GFX9-NEXT: s_subb_u32 s1, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll index 52e76dd24a20b4..9f5b6389ab59f5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -6,7 +6,7 @@ define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) ; GCN-LABEL: test_mul24_knownbits_kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: 
v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffffe0, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index 1358d91ae102c9..a35fbaadddf9ef 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -240,7 +240,7 @@ entry: define void @sincos_v2f32_nocontract(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_nocontract -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -298,7 +298,7 @@ entry: define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) 
[[__SINCOS_]]) @@ -317,7 +317,7 @@ entry: define void @sincos_v3f32(<3 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v3f32 -; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x float> @_Z6sincosDv3_fPU3AS5S_(<3 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -340,7 +340,7 @@ entry: define void @sincos_v4f32(<4 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v4f32 -; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x float>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x float> @_Z6sincosDv4_fPU3AS5S_(<4 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -359,7 +359,7 @@ entry: define void @sincos_v8f32(<8 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f32 -; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<8 x float> 
[[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x float>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x float> @_Z6sincosDv8_fPU3AS5S_(<8 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -378,7 +378,7 @@ entry: define void @sincos_v16f32(<16 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f32 -; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <16 x float>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x float> @_Z6sincosDv16_fPU3AS5S_(<16 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -397,7 +397,7 @@ entry: define void @sincos_f64_nocontract(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_nocontract -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -417,7 +417,7 @@ entry: define void @sincos_v2f64_nocontract(<2 x double> %x, ptr 
addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64_nocontract -; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -436,7 +436,7 @@ entry: define void @sincos_f64(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -455,7 +455,7 @@ entry: define void @sincos_f64_order1(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f64_order1 -; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) 
local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -474,7 +474,7 @@ entry: define void @sincos_v2f64(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f64 -; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -493,7 +493,7 @@ entry: define void @sincos_v3f64(<3 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v3f64 -; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <3 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <3 x double> @_Z6sincosDv3_dPU3AS5S_(<3 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -516,7 +516,7 @@ entry: define void @sincos_v4f64(<4 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void 
@sincos_v4f64 -; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <4 x double>, align 32, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <4 x double> @_Z6sincosDv4_dPU3AS5S_(<4 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -535,7 +535,7 @@ entry: define void @sincos_v8f64(<8 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v8f64 -; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <8 x double>, align 64, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <8 x double> @_Z6sincosDv8_dPU3AS5S_(<8 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -554,7 +554,7 @@ entry: define void @sincos_v16f64(<16 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v16f64 -; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = 
alloca <16 x double>, align 128, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract <16 x double> @_Z6sincosDv16_dPU3AS5S_(<16 x double> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -666,7 +666,7 @@ bb1: define float @select_sin_or_cos_f32(i1 %cond, float %x) { ; CHECK-LABEL: define float @select_sin_or_cos_f32 -; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -685,7 +685,7 @@ declare void @func(ptr addrspace(1)) define void @sincos_f32_value_is_instr(ptr addrspace(1) %value.ptr, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_instr -; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: tail call void @func(ptr addrspace(1) [[VALUE_PTR]]) @@ -838,7 +838,7 @@ entry: define void @sincos_v2f32_flag_intersect1(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_v2f32_flag_intersect1 -; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly 
[[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call nnan contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -859,7 +859,7 @@ declare void @use_stack_ptrs(ptr addrspace(5), ptr addrspace(5)) define void @sincos_f32_alloca_insertpt(float %x) { ; CHECK-LABEL: define void @sincos_f32_alloca_insertpt -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[ALLOCA1:%.*]] = alloca i32, align 4, addrspace(5) @@ -884,7 +884,7 @@ entry: define float @sincos_f32_unused_result_cos(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_cos -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: ret float [[SIN]] @@ -899,7 +899,7 @@ entry: define float @sincos_f32_unused_result_sin(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_sin -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: ret float [[COS]] @@ -914,7 +914,7 @@ entry: define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) { ; CHECK-LABEL: define void @sincos_f32_repeated_uses -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr 
addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) @@ -947,7 +947,7 @@ entry: define void @sin_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @sin_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 @@ -965,7 +965,7 @@ entry: define void @cos_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) { ; CHECK-LABEL: define void @cos_f32_indirect_call_user -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[COS_OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 9ec8e425a3f55c..bd61558905f634 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -37,9 +37,9 @@ ; by 4 bytes. ; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24 -; HSA-ALLOCA: s_add_i32 s12, s12, s17 -; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13 -; HSA-ALLOCA-DAG: s_lshr_b32 flat_scratch_hi, s12, 8 +; HSA-ALLOCA: s_add_i32 s6, s6, s9 +; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7 +; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index 8cda553e61c8ad..cc116dfe807ecd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -9,8 +9,8 @@ ; Legacy intrinsics that just read implicit parameters ; FUNC-LABEL: {{^}}ngroups_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x0 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -24,8 +24,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -39,8 +39,8 @@ entry: } ; FUNC-LABEL: {{^}}ngroups_z: -; 
SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x2 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -54,8 +54,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x3 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0xc +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -69,8 +69,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x4 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x10 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -84,8 +84,8 @@ entry: } ; FUNC-LABEL: {{^}}global_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x5 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x14 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -99,8 +99,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_x: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x6 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x18 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -114,8 +114,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_y: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x7 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x1c +; 
SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] @@ -129,8 +129,8 @@ entry: } ; FUNC-LABEL: {{^}}local_size_z: -; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x8 -; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[2:3], 0x20 +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOHSA: buffer_store_dword [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index 91abbfff7f2dee..87084d780410b1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -23,7 +23,7 @@ ; ELF: Section: .text (0x2) ; ELF: } -; GFX10: NumSGPRsForWavesPerEU: 4 +; GFX10: NumSGPRsForWavesPerEU: 2 ; GFX10: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 1f8da18cdd3014..897e134ee48d83 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -9,8 +9,8 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GCN-LABEL: anyext_i1_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -22,8 +22,8 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX8-LABEL: anyext_i1_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -37,17 +37,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; ; GFX9-LABEL: anyext_i1_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp = icmp eq i32 %cond, 0 @@ -62,8 +62,8 @@ entry: define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; GCN-LABEL: s_anyext_i16_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s11 @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: s_anyext_i16_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s7 @@ -113,13 +113,13 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: s_anyext_i16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v3, v1, s[0:1] +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index b1134ae78cb979..cff408c8f01b4c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -24,18 +24,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,18 +52,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -80,18 +80,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -107,10 +107,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -118,10 +118,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -131,25 +130,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: 
s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -160,7 +158,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -168,7 +166,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -176,8 +174,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; 
GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -191,24 +189,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -223,7 +221,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; 
GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -231,7 +229,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -239,8 +237,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -254,24 +252,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 
0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -292,23 +290,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 
s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -321,24 +319,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -351,24 +349,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -380,16 +378,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -397,10 +395,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -410,37 +407,37 @@ 
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -448,7 +445,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -456,8 +453,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -471,41 +468,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -513,7 +510,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -521,8 +518,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; 
GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -536,32 +533,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: global_store_b32 v1, v0, 
s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -574,18 +571,18 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 @@ -594,17 +591,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB2_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -617,36 +614,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -658,36 +655,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -698,38 +695,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; 
GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -739,37 +735,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; 
GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -779,182 +774,174 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 
s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; 
GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: 
.LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; 
GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -968,18 +955,18 @@ entry: define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) { ; GFX6-LABEL: struct_add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1 @@ -988,19 +975,19 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB3_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dword s5, s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB3_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1013,38 +1000,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 
s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB3_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB3_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -1056,38 +1043,38 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], 
vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1098,41 +1085,40 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], 
exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W64-NEXT: ; %bb.3: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s5, s[2:3], 0x44 -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mov_b32_e32 v2, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1142,40 +1128,39 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: 
s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX10W32-NEXT: ; %bb.3: ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dword s8, s[2:3], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: v_mov_b32_e32 v2, s8 ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB3_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -1185,192 +1170,186 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 
idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 
s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB3_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 
v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: struct_add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W64-NEXT: ; %bb.3: ; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44 -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, s5 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB3_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: struct_add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, 
s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX12W32-NEXT: ; %bb.3: ; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b32 s8, s[2:3], 0x44 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: v_mov_b32_e32 v2, s8 +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen 
th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB3_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1384,8 +1363,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -1397,9 +1376,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,9 +1389,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1422,10 +1401,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1433,67 +1411,33 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: add_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 
:: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: add_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1509,18 +1453,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1538,18 +1482,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1567,18 +1511,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1595,10 +1539,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1606,10 +1550,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1620,25 +1563,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1650,7 +1592,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1658,7 +1600,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1666,8 +1608,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: 
s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1682,24 +1624,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1715,7 +1657,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; 
GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1723,7 +1665,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1731,8 +1673,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1747,24 +1689,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1786,23 +1728,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1815,24 +1757,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; 
GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1845,24 +1787,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; 
GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1874,16 +1816,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1891,8 +1833,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1904,38 +1846,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: 
s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1943,7 +1885,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1951,8 +1893,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1967,42 +1909,42 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, 
s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2010,7 +1952,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -2018,8 +1960,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -2034,33 +1976,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2073,18 +2015,18 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 @@ -2093,17 +2035,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB7_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB7_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2116,36 +2058,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -2157,36 +2099,36 @@ define amdgpu_kernel void 
@sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2197,38 +2139,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: 
s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2238,37 +2179,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr 
addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 
s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2278,184 +2218,176 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: 
s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; 
GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: 
; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; 
GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2469,8 +2401,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -2482,9 +2414,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2495,9 +2427,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2507,10 +2439,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2518,73 +2449,36 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: sub_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 
v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sub_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index bc5d2662dcb45f..980f59cacaeb1e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -225,7 +225,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -261,7 +261,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -298,7 +298,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -340,25 +340,25 @@ entry: define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) { ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, 
s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB1_2: @@ -367,36 +367,36 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 
0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -408,29 +408,29 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -443,20 +443,20 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 @@ -472,28 +472,28 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: 
s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc @@ -502,38 +502,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv @@ -544,7 +544,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1] +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -553,17 +553,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; 
GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -591,27 +591,27 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB1_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mul_i32 s2, s8, s2 +; 
GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: global_wb scope:SCOPE_DEV -; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: @@ -621,7 +621,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1] +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1] ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -630,17 +630,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB1_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -673,23 +673,23 @@ entry: define amdgpu_kernel void 
@add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -720,22 +720,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_ITERATIVE-LABEL: add_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -765,22 +765,22 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: add_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; 
GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -810,21 +810,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_ITERATIVE-LABEL: add_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -856,21 +856,21 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_ITERATIVE-LABEL: add_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 @@ -901,51 +901,49 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: add_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB2_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 @@ -954,49 +952,47 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: add_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB2_4: 
; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 @@ -1005,51 +1001,49 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: add_i32_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB2_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; 
GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264_ITERATIVE-NEXT: s_nop 0 @@ -1058,49 +1052,47 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: add_i32_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; 
%ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB2_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 
v0, off, s[0:3], null ; GFX1232_ITERATIVE-NEXT: s_nop 0 @@ -1109,7 +1101,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: add_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: s_mov_b32 s10, s6 @@ -1127,7 +1119,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: add_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -1180,7 +1172,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -1237,7 +1229,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1251,8 +1243,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -1301,15 +1293,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -1351,33 +1343,32 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -1426,38 +1417,36 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -1490,33 +1479,32 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: add_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry -; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -1565,38 +1553,36 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: add_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry -; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -1637,7 +1623,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1675,7 +1661,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1712,7 +1698,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1749,7 +1735,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1785,7 +1771,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1820,7 +1806,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1859,7 +1845,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1896,7 +1882,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: add_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1936,7 +1922,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -1981,8 +1967,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2025,8 +2011,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, 
v0 @@ -2065,24 +2051,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -2091,38 +2077,38 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, 
-1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -2135,37 +2121,37 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v2, s[2:3] 
-; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s1, v2, v[1:2] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -2176,22 +2162,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v2, v[1:2] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2] ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -2237,17 +2223,17 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -2283,8 +2269,8 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: add_i64_uniform: ; GFX1264: ; %bb.0: ; 
%entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -2326,18 +2312,18 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: add_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB4_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -2372,13 +2358,13 @@ entry: define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS_ITERATIVE-LABEL: add_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: 
s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 @@ -2387,12 +2373,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB5_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2427,13 +2413,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_ITERATIVE-LABEL: add_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -2442,11 +2428,11 @@ define 
amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2480,13 +2466,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: add_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -2495,11 +2481,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; 
GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -2534,12 +2520,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-LABEL: add_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 @@ -2547,11 +2533,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -2587,24 +2573,24 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-LABEL: add_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -2638,58 +2624,56 @@ define 
amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB5_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 -; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -2699,54 +2683,54 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], 0 glc +; 
GFX1132_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB5_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 -; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -2756,55 +2740,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: add_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; 
GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 
0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB5_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 -; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -2814,51 +2796,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: add_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; 
GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz 
.LBB5_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB5_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 -; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -2868,7 +2850,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: add_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 
; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -2887,7 +2869,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -2987,7 +2969,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -3089,17 +3071,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -3144,8 +3126,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3204,17 +3186,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -3247,8 +3229,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo @@ -3297,83 +3279,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, 
vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31 ; GFX1164_DPP-NEXT: 
v_writelane_b32 v1, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -3419,64 +3399,62 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp 
v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3516,83 +3494,80 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: add_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_mov_b64 
exec, s[0:1] +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp 
v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, 
vcc, v4, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: 
v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; 
GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -3638,63 +3613,61 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: add_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3743,7 +3716,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -3775,7 +3748,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -3808,7 +3781,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 @@ -3841,7 +3814,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3876,7 +3849,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -3910,7 +3883,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -3948,7 +3921,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ 
-3985,7 +3958,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i32_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -4023,7 +3996,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i32_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s5, exec_lo ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 @@ -4066,25 +4039,25 @@ entry: define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) { ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s8, s6 -; 
GFX7LESS-NEXT: s_mov_b32 s9, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 -; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB7_2: @@ -4093,36 +4066,36 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -4134,29 +4107,29 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -4169,20 +4142,20 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr 
addrspace( ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s10, s2 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 ; GFX1064-NEXT: s_mov_b32 s14, -1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_mov_b32 s12, s6 @@ -4195,7 +4168,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -4206,21 +4179,21 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: 
s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc @@ -4229,9 +4202,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 @@ -4242,33 +4215,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s3, s2, s3 -; GFX1164-NEXT: s_mov_b32 s10, -1 -; GFX1164-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164-NEXT: s_mov_b32 s8, s6 -; GFX1164-NEXT: s_mov_b32 s9, s7 -; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s6, -1 @@ -4282,17 +4255,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: ; 
implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 @@ -4321,33 +4294,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i32_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34 -; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1264-NEXT: s_mov_b64 s[2:3], exec ; GFX1264-NEXT: s_mov_b64 s[0:1], exec -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264-NEXT: s_cbranch_execz .LBB7_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_i32 s3, s2, s3 -; GFX1264-NEXT: s_mov_b32 s10, -1 -; GFX1264-NEXT: v_mov_b32_e32 v1, s3 -; GFX1264-NEXT: s_mov_b32 s8, s6 -; GFX1264-NEXT: s_mov_b32 s9, s7 +; GFX1264-NEXT: s_mul_i32 s2, s8, s2 +; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s2 +; GFX1264-NEXT: s_mov_b32 s12, s6 +; GFX1264-NEXT: s_mov_b32 s13, s7 ; GFX1264-NEXT: global_wb scope:SCOPE_DEV -; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV 
+; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB7_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 @@ -4361,17 +4334,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i32_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s8, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s1, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232-NEXT: s_cbranch_execz .LBB7_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_i32 s2, s0, s2 @@ -4405,23 +4378,23 @@ entry: define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB8_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4452,22 +4425,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_ITERATIVE-LABEL: sub_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4497,22 +4470,22 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: sub_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s4 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s4 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0 ; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4542,21 +4515,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; 
GFX1064_ITERATIVE-LABEL: sub_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s6, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4588,21 +4561,21 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_ITERATIVE-LABEL: sub_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s1 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 
v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 @@ -4633,51 +4606,49 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: sub_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s6, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], 
s[4:5] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB8_4: ; 
GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 @@ -4686,49 +4657,47 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: sub_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_add_i32 s4, s4, s5 -; GFX1132_ITERATIVE-NEXT: 
s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB8_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; 
GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 @@ -4737,51 +4706,49 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: sub_i32_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s6, 0 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7 +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8 -; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_ITERATIVE-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v0, s6 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: .LBB8_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264_ITERATIVE-NEXT: s_nop 0 @@ -4790,49 +4757,47 @@ define amdgpu_kernel 
void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: sub_i32_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s6 ; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5 -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1 
+; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0 ; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v1, s4 +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB8_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232_ITERATIVE-NEXT: s_nop 0 @@ -4841,7 +4806,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: s_mov_b32 s10, 
s6 @@ -4859,7 +4824,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: sub_i32_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -4912,7 +4877,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i32_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -4969,7 +4934,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4983,8 +4948,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; 
GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -5033,15 +4998,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 @@ -5083,33 +5048,32 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -5158,38 
+5122,36 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp 
v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 @@ -5222,33 +5184,32 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: sub_i32_varying: ; GFX1264_DPP: ; %bb.0: ; %entry -; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp 
v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16 @@ -5297,38 +5258,36 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: sub_i32_varying: ; GFX1232_DPP: ; %bb.0: ; %entry -; GFX1232_DPP-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 @@ -5369,7 +5328,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -5407,7 +5366,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5445,7 +5404,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5483,7 +5442,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5522,7 +5481,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -5560,7 +5519,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5602,7 +5561,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -5642,7 +5601,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1264-LABEL: sub_i64_constant: ; GFX1264: ; %bb.0: ; %entry -; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264-NEXT: 
s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5685,7 +5644,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry -; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232-NEXT: s_mov_b32 s4, exec_lo ; GFX1232-NEXT: s_mov_b32 s5, 0 ; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 @@ -5733,8 +5692,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -5777,8 +5736,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 @@ -5818,24 +5777,24 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s6 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s0, s6 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -5844,12 +5803,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 @@ -5862,22 +5821,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, 
v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s9, s1, s8 -; GFX1064-NEXT: s_mul_hi_u32 s10, s0, s8 -; GFX1064-NEXT: s_mul_i32 s8, s0, s8 +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s10 @@ -5890,12 +5849,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s0, v2, 0 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s1, v2, v[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 @@ -5907,23 +5866,23 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s8, s1, s3 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1032-NEXT: s_mul_i32 s3, s0, s3 +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 @@ -5934,14 +5893,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s0, v2, 0 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v2, v[4:5] +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo @@ -5951,8 +5910,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b128 s[4:7], 
s[2:3], 0x24 -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -6000,17 +5959,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s8 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 @@ -6048,8 +6007,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-LABEL: sub_i64_uniform: ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_clause 0x1 -; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec ; GFX1264-NEXT: s_mov_b32 s11, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 @@ -6095,18 +6054,18 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-LABEL: sub_i64_uniform: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_clause 0x1 -; 
GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX1232-NEXT: s_mov_b32 s9, exec_lo +; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1232-NEXT: s_mov_b32 s2, exec_lo ; GFX1232-NEXT: s_mov_b32 s3, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1232-NEXT: s_mov_b32 s8, exec_lo ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB10_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9 +; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1232-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3] @@ -6145,13 +6104,13 @@ entry: define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) { ; GFX7LESS_ITERATIVE-LABEL: sub_i64_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 @@ -6160,12 +6119,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; 
GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB11_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6200,13 +6159,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_ITERATIVE-LABEL: sub_i64_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -6215,11 +6174,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 ; GFX8_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8_ITERATIVE-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6253,13 +6212,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_ITERATIVE-LABEL: sub_i64_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 @@ -6268,11 +6227,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s5, m0 ; GFX9_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6307,12 +6266,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-LABEL: 
sub_i64_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 @@ -6320,11 +6279,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 ; GFX1064_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6360,24 +6319,24 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-LABEL: sub_i64_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; 
GFX1032_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s1 -; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s1 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 ; GFX1032_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 ; GFX1032_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -6411,58 +6370,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: 
; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s4, s4, s7 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s5, s5, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1164_ITERATIVE-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1164_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB11_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -6472,54 +6429,54 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: 
sub_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s4, s4, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s5, s5, s7 -; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: 
s_cmp_lg_u32 s2, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s9, s3 -; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], 0 glc +; GFX1132_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB11_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -6529,55 +6486,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: sub_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1] +; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s10 -; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s10 -; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9] +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s10 +; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s10 +; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] 
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1264_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], exec +; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1264_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1264_ITERATIVE-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1264_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1264_ITERATIVE-NEXT: ; %bb.3: -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v4, s5 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; 
GFX1264_ITERATIVE-NEXT: .LBB11_4: ; GFX1264_ITERATIVE-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -6587,51 +6542,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: sub_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 -; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo +; GFX1232_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[4:5], 0 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 +; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1 -; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s5, s1 -; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1 -; 
GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 +; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v2, s5, s3 +; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s3 +; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8 ; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] -; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232_ITERATIVE-NEXT: s_mov_b32 s6, exec_lo +; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1232_ITERATIVE-NEXT: s_xor_b32 s6, exec_lo, s6 ; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4 ; GFX1232_ITERATIVE-NEXT: ; %bb.3: -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3 ; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV -; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[3:4], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0 ; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV ; GFX1232_ITERATIVE-NEXT: .LBB11_4: ; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -6641,7 +6596,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS_DPP-NEXT: s_mov_b32 s6, -1 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6660,7 +6615,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6760,7 +6715,7 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -6862,17 +6817,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6917,8 +6872,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf @@ 
-6977,17 +6932,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -7020,8 +6975,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032_DPP-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo @@ -7070,83 +7025,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 
v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; 
GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -7192,64 +7145,62 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp 
v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -7289,83 +7240,80 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_DPP-LABEL: sub_i64_varying: ; GFX1264_DPP: ; %bb.0: ; %entry -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec +; 
GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1264_DPP-NEXT: s_not_b64 exec, exec -; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: 
v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; 
GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5 +; GFX1264_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31 -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1264_DPP-NEXT: 
v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1264_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1264_DPP-NEXT: v_readlane_b32 s8, v4, 31 -; GFX1264_DPP-NEXT: v_readlane_b32 s9, v3, 31 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 15 +; GFX1264_DPP-NEXT: v_readlane_b32 s8, v3, 31 +; GFX1264_DPP-NEXT: v_readlane_b32 s9, v4, 31 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s6, v4, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s7, 16 -; GFX1264_DPP-NEXT: v_readlane_b32 s10, v4, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63 +; GFX1264_DPP-NEXT: v_readlane_b32 s10, v3, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s11, v4, 47 +; GFX1264_DPP-NEXT: v_readlane_b32 s7, v4, 63 ; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5] @@ -7411,63 +7359,61 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_DPP-LABEL: sub_i64_varying: ; GFX1232_DPP: ; %bb.0: ; %entry -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; 
GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2 -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX1232_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1232_DPP-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; 
GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 
row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15 -; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1232_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; 
GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_readlane_b32 s7, v3, 15 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1232_DPP-NEXT: v_readlane_b32 s8, v4, 15 +; GFX1232_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 1439d4b40c951c..dc7b79a903a3d9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -29,7 +29,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -40,8 +40,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -57,7 +57,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -68,8 +68,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -85,7 +85,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -95,8 +95,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -112,7 +112,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; 
GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -124,9 +124,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -137,25 +136,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -167,7 
+165,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -184,8 +182,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -199,24 +197,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: 
buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -237,12 +235,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -254,8 +252,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -267,13 +265,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -285,8 +283,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -298,13 +296,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -315,8 +313,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -328,13 +326,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; 
%bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -347,9 +345,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -360,40 +357,39 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -411,8 +407,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -426,9 +422,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: 
s_load_b32 s0, s[2:3], 0x2c +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -437,22 +433,22 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] -; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -465,18 +461,18 @@ entry: define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: add_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB2_1 @@ -485,8 +481,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -495,8 +491,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB2_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -507,27 +503,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: add_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -536,8 +532,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8_ITERATIVE-NEXT: .LBB2_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -548,27 +544,27 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: add_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; 
GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -576,8 +572,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB2_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -588,26 +584,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: add_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], 
vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -617,9 +613,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB2_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -630,37 +625,36 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: add_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; 
GFX1032_ITERATIVE-NEXT: s_add_i32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB2_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 @@ -671,45 +665,43 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: add_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; 
GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB2_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -719,42 +711,41 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: add_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB2_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -764,7 +755,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: add_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -784,7 +775,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -800,10 +791,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX8_DPP-NEXT: ; %bb.1: ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 @@ -811,8 +802,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB2_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -831,7 +822,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -847,18 +838,18 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB2_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -874,7 +865,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -890,38 +881,39 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; 
%bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB2_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i32_varying: @@ -930,7 +922,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -939,154 +931,155 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: 
v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB2_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; 
; GFX1164_DPP-LABEL: add_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; 
GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB2_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132_DPP-NEXT: ds_add_rtn_u32 v0, 
v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB2_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -1244,17 +1237,16 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1164_ITERATIVE-LABEL: add_i32_varying_nouse: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, 
s2, s6 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1277,17 +1269,16 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1132_ITERATIVE-LABEL: add_i32_varying_nouse: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1447,29 +1438,27 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1164_DPP-LABEL: add_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -1489,29 +1478,27 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX1132_DPP-LABEL: add_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: 
s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -1534,7 +1521,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1545,8 +1532,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1567,7 +1554,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1578,10 +1565,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1599,7 +1586,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1609,10 +1596,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] @@ -1630,7 +1617,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1642,9 +1629,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1656,25 +1642,24 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1687,7 +1672,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1704,8 +1689,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: 
v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1720,25 +1705,25 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1760,7 +1745,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1801,7 +1786,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr 
addrspace(1) %out, i64 %additive) ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1839,7 +1824,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX9-LABEL: add_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1877,7 +1862,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1064-LABEL: add_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -1913,7 +1898,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1032-LABEL: add_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -1948,7 +1933,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; ; GFX1164-LABEL: add_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -1989,7 +1974,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; 
; GFX1132-LABEL: add_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -2037,7 +2022,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2045,10 +2030,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -2064,14 +2049,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; 
GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB6_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -2087,7 +2072,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2095,10 +2080,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2112,15 +2097,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; 
GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB6_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2136,7 +2121,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2144,10 +2129,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2161,14 +2146,14 @@ 
define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB6_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -2184,17 +2169,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s2, s2, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; 
GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2208,17 +2193,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB6_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 @@ -2233,17 +2217,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; 
GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s2, s2, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -2256,17 +2240,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB6_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 @@ -2279,52 +2262,50 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: ; 
implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 -; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s2, s2, s7 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: 
v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB6_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0 -; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2335,48 +2316,48 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132_ITERATIVE-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: s_add_u32 s2, s2, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; 
GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB6_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 -; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2387,7 +2368,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: add_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2404,17 +2385,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -2466,10 +2447,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -2478,8 +2459,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB6_2: -; GFX8_DPP-NEXT: s_or_b64 
exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 @@ -2499,17 +2480,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -2561,10 +2542,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -2572,8 +2553,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], 
v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB6_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 @@ -2591,17 +2572,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -2646,9 +2627,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 
row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 @@ -2663,54 +2644,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 
0x31016000 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s3, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s4, v10, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -2745,246 +2725,241 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, 
v9 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s3, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s4, v10, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: add_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 15 
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4 ; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] ; GFX1164_DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB6_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s3, v8 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s4, v9, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: add_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 
v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo 
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: 
v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; 
GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB6_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v9, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], 
off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -3164,7 +3139,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 @@ -3202,21 +3176,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying_nouse: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0 ; 
GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -3521,67 +3495,65 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1164_DPP-LABEL: add_i64_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 
row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v3 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v3, v2 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164_DPP-NEXT: ; %bb.1: @@ -3593,28 +3565,27 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1132_DPP-LABEL: add_i64_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v4, v7 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf @@ -3667,7 +3638,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3678,8 +3649,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -3696,7 +3667,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3707,8 +3678,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -3725,7 +3696,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3735,8 +3706,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -3753,7 
+3724,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3765,9 +3736,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -3779,25 +3749,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -3810,7 +3779,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3827,8 +3796,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -3843,24 +3812,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s1 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -3882,12 +3851,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dword s6, s[2:3], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3899,8 +3868,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB9_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -3912,13 +3881,13 @@ define amdgpu_kernel void 
@sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3930,8 +3899,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 @@ -3943,13 +3912,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3960,8 +3929,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -3973,13 +3942,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -3992,8 +3961,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -4005,40 +3974,40 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; 
%bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s4, s0, s4 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b32 s6, s[2:3], 0x2c +; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4056,8 +4025,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 @@ -4072,9 +4041,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4083,23 +4052,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s4, s0, s4 +; GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_nop 0 ; 
GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -4112,18 +4081,18 @@ entry: define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: sub_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB10_1 @@ -4132,8 +4101,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -4142,8 +4111,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) 
%out) { ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB10_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -4154,27 +4123,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: sub_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc 
+; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -4183,8 +4152,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB10_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -4195,27 +4164,27 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: sub_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -4223,8 +4192,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB10_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -4235,26 +4204,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: sub_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; 
GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -4264,9 +4233,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB10_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -4277,37 +4245,36 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: sub_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: 
v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_add_i32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB10_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 @@ -4318,45 +4285,43 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: sub_i32_varying: ; 
GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_add_i32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; 
GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB10_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4366,42 +4331,41 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: sub_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB10_1: ; 
%ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_add_i32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB10_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB10_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -4411,7 +4375,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: sub_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4431,7 +4395,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -4447,10 +4411,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 
row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 @@ -4458,8 +4422,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB10_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -4478,7 +4442,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -4494,18 +4458,18 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: 
ds_sub_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB10_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -4521,7 +4485,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4537,38 +4501,39 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB10_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i32_varying: @@ -4577,7 +4542,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 @@ -4586,154 +4551,155 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB10_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 
-; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; 
GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB10_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: 
s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB10_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -4891,17 +4857,16 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1164_ITERATIVE-LABEL: sub_i32_varying_nouse: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 @@ -4924,17 +4889,16 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1132_ITERATIVE-LABEL: sub_i32_varying_nouse: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 ; GFX1132_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1 @@ -5094,29 +5058,27 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1164_DPP-LABEL: sub_i32_varying_nouse: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 
exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) @@ -5136,29 +5098,27 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX1132_DPP-LABEL: 
sub_i32_varying_nouse: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132_DPP-NEXT: ; %bb.1: @@ -5181,7 +5141,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -5192,8 +5152,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB12_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -5214,7 +5174,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -5225,8 +5185,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB12_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 ; GFX8-NEXT: 
v_mul_u32_u24_e32 v0, 5, v2 @@ -5247,7 +5207,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -5257,8 +5217,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB12_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -5279,7 +5239,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -5291,9 +5251,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -5308,25 +5267,24 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) 
%out) { ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_mov_b32 s1, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s1, s1, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -5342,7 +5300,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -5359,8 +5317,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB12_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -5378,25 +5336,25 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s1, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s1, s1, 5 +; GFX1132-NEXT: s_mul_i32 s3, s3, 5 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB12_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -5421,7 +5379,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -5462,7 +5420,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5501,7 +5459,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX9-LABEL: sub_i64_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5541,7 +5499,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1064-LABEL: sub_i64_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 @@ -5580,7 +5538,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1032-LABEL: sub_i64_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -5618,7 +5576,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1164-LABEL: sub_i64_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 
s[6:7], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 @@ -5661,7 +5619,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; ; GFX1132-LABEL: sub_i64_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 @@ -5711,7 +5669,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5719,10 +5677,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX7LESS_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX7LESS_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -5738,14 +5696,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; 
GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB14_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -5761,7 +5719,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5769,10 +5727,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -5786,15 
+5744,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB14_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -5810,7 +5768,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5818,10 +5776,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s6 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_add_u32 s0, s0, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_add_u32 s2, s2, s8 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: s_addc_u32 s3, 
s3, s7 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -5835,14 +5793,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB14_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s4 @@ -5858,17 +5816,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6 -; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 -; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1064_ITERATIVE-NEXT: 
v_writelane_b32 v2, s3, s6 +; GFX1064_ITERATIVE-NEXT: s_add_u32 s2, s2, s7 +; GFX1064_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -5882,17 +5840,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB14_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 @@ -5907,17 +5864,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032_ITERATIVE-NEXT: 
v_readlane_b32 s7, v3, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: s_add_u32 s2, s2, s6 +; GFX1032_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -5930,17 +5887,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB14_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 @@ -5953,52 +5909,50 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; 
GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 -; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s6 +; GFX1164_ITERATIVE-NEXT: s_add_u32 s2, s2, s7 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8 +; GFX1164_ITERATIVE-NEXT: s_addc_u32 s3, s3, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB14_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6009,48 +5963,48 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 
v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 -; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: s_add_u32 s2, s2, s6 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132_ITERATIVE-NEXT: s_addc_u32 s3, s3, s7 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB14_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6061,7 +6015,7 @@ define 
amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: sub_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6078,17 +6032,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -6140,10 +6094,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -6152,8 +6106,8 @@ define amdgpu_kernel void 
@sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB14_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 @@ -6173,17 +6127,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -6235,10 +6189,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 ; 
GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -6246,8 +6200,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB14_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 @@ -6265,17 +6219,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6320,9 +6274,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 ; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 @@ -6337,54 +6291,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v4, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: 
v_readfirstlane_b32 s3, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s3, v9 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s4, v10, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -6419,246 +6372,241 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 
s3, v3, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s3, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s4, v10, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: sub_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 
v3, vcc, v3, v6, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v7, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v5, vcc +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 
31 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v4, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 
v10, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s4 ; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB14_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s3, v8 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s4, v9, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: sub_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 
row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 
v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v6, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v3, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 +; 
GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB14_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v8 +; 
GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v9, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -6673,18 +6621,18 @@ entry: define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: and_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB15_1 @@ -6693,8 +6641,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; 
GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -6703,8 +6651,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB15_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -6715,27 +6663,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: and_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -6744,8 +6692,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB15_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -6756,27 +6704,27 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: and_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: 
s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -6784,8 +6732,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB15_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -6796,26 +6744,26 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: and_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; 
GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -6825,9 +6773,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB15_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 @@ -6838,37 +6785,36 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: and_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; 
GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_and_b32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB15_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 @@ -6879,45 +6825,43 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: and_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: 
v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB15_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -6927,42 +6871,41 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: and_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; 
GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_and_b32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; 
GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB15_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -6972,7 +6915,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: and_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6987,14 +6930,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; 
GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7009,10 +6952,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7021,8 +6964,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB15_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -7036,14 +6979,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; 
GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7058,10 +7001,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -7069,8 +7012,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB15_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -7086,7 +7029,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7102,38 +7045,39 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB15_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; 
GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i32_varying: @@ -7142,7 +7086,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7151,155 +7095,155 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; 
GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB15_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB15_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, 
v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1132_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB15_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -7315,7 +7259,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7323,13 +7267,13 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] -; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB16_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -7341,14 +7285,14 @@ define amdgpu_kernel void 
@and_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB16_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -7363,7 +7307,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7372,9 +7316,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX8_ITERATIVE-NEXT: 
s_cmp_lg_u64 s[4:5], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 @@ -7387,15 +7331,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB16_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -7410,7 +7354,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7419,9 +7363,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: 
s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 @@ -7434,14 +7378,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB16_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -7456,7 +7400,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7464,10 +7408,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; 
GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7479,17 +7423,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB16_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 @@ -7504,7 +7447,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7512,10 +7455,10 @@ define 
amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 -; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -7526,17 +7469,16 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB16_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 @@ -7549,109 +7491,107 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: and_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; 
%entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] -; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB16_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: and_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: 
v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 -; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; 
GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB16_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: and_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7668,17 
+7608,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 @@ -7700,10 +7640,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -7712,8 +7652,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB16_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -7732,17 +7672,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 @@ -7764,10 +7704,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -7775,8 +7715,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB16_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -7799,7 +7739,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 @@ -7822,9 +7762,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -7839,37 +7779,36 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 
v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, 
s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i64_varying: @@ -7882,7 +7821,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 @@ -7899,47 +7838,47 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 
s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 @@ -7948,7 +7887,7 @@ define amdgpu_kernel void 
@and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7981,10 +7920,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -7999,45 +7938,44 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, 
s9, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB16_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; 
%entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -8045,7 +7983,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -8068,43 +8006,43 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4 ; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB16_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: 
buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -8119,18 +8057,18 @@ entry: define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: or_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB17_1 @@ -8139,8 +8077,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX7LESS_ITERATIVE-NEXT: ; 
%bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8149,8 +8087,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB17_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -8161,27 +8099,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: or_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8190,8 +8128,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB17_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -8202,27 +8140,27 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: or_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; 
GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8230,8 +8168,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB17_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -8242,26 +8180,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: or_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; 
GFX1064_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -8271,9 +8209,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB17_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 @@ -8284,37 +8221,36 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: or_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_or_b32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB17_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 @@ -8325,45 +8261,43 @@ 
define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: or_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_or_b32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB17_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -8373,42 +8307,41 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: or_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: 
s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_or_b32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB17_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; 
GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB17_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -8418,7 +8351,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: or_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8438,7 +8371,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -8454,10 +8387,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, 
v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 @@ -8465,8 +8398,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB17_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -8485,7 +8418,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -8501,18 +8434,18 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB17_2 ; 
GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB17_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -8528,7 +8461,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8544,38 +8477,39 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: 
v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB17_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i32_varying: @@ -8584,7 +8518,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8593,154 +8527,155 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB17_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: 
v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: 
s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB17_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; 
GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB17_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -8756,7 +8691,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8764,13 +8699,13 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: 
v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB18_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -8782,14 +8717,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB18_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -8804,7 +8739,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: 
s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8813,9 +8748,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 @@ -8828,15 +8763,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB18_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8851,7 +8786,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; 
GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8860,9 +8795,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 @@ -8875,14 +8810,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB18_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -8897,7 +8832,7 @@ define amdgpu_kernel void @or_i64_varying(ptr 
addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8905,10 +8840,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -8920,17 +8855,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB18_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 @@ -8945,7 +8879,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8953,10 +8887,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 -; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -8967,17 +8901,16 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: 
v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB18_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 @@ -8990,109 +8923,107 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: or_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] -; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: 
s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB18_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 -; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: or_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 -; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; 
GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB18_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s2, v1 -; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: 
v_or_b32_e32 v2, s2, v2 +; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: or_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9109,17 +9040,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -9141,10 +9072,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf 
bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -9153,8 +9084,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB18_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -9173,17 +9104,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: 
s_nop 0 @@ -9205,10 +9136,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -9216,8 +9147,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB18_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -9240,7 +9171,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -9263,9 +9194,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, 
v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -9280,37 +9211,36 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s4, v7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i64_varying: @@ -9323,7 +9253,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -9340,47 +9270,47 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032_DPP-NEXT: 
v_readlane_b32 s5, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 
0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s4, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7 @@ -9389,7 +9319,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9422,10 +9352,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -9440,45 +9370,44 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB18_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; 
GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s4, v7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -9486,7 +9415,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9509,43 +9438,43 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4 ; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB18_2: -; 
GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s3, v8 +; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s4, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -9560,18 +9489,18 @@ entry: define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: xor_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; 
GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB19_1 @@ -9580,8 +9509,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -9590,8 +9519,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB19_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -9602,27 +9531,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: xor_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; 
GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -9631,8 +9560,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB19_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -9643,27 +9572,27 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: xor_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; 
GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -9671,8 +9600,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB19_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 
0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -9683,26 +9612,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: xor_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -9712,9 +9641,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB19_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 
exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -9725,37 +9653,36 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: xor_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 
; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB19_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 @@ -9766,45 +9693,43 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: xor_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; 
GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_xor_b32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB19_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, 
exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -9814,42 +9739,41 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: xor_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: 
s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB19_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB19_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -9859,7 +9783,7 @@ define 
amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: xor_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9879,7 +9803,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -9895,10 +9819,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 @@ -9906,8 +9830,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB19_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -9926,7 +9850,7 @@ define amdgpu_kernel void 
@xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -9942,18 +9866,18 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB19_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -9969,7 +9893,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9985,38 
+9909,39 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB19_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i32_varying: @@ -10025,7 +9950,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10034,154 +9959,155 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; 
GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB19_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, 
v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB19_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; 
GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB19_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v0, s3, v0 +; 
GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -10197,7 +10123,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10205,13 +10131,13 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[8:9], s[4:5], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[8:9] -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB20_1 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 @@ -10223,14 +10149,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; 
GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB20_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 @@ -10245,7 +10171,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10254,9 +10180,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 @@ -10269,15 +10195,15 @@ define amdgpu_kernel void 
@xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB20_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -10292,7 +10218,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10301,9 +10227,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] ; GFX9_ITERATIVE-NEXT: 
s_cmp_lg_u64 s[4:5], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 @@ -10316,14 +10242,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB20_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 @@ -10338,7 +10264,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10346,10 +10272,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[8:9] -; 
GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10361,17 +10287,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB20_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1064_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 @@ -10386,7 +10311,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10394,10 +10319,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; 
GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s8 -; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -10408,17 +10333,16 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB20_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1032_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 @@ -10431,109 +10355,107 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: xor_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[8:9] -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; 
GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB20_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: xor_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s8 -; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; 
GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB20_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v1, s3, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: xor_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10550,17 +10472,17 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 @@ -10582,10 +10504,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -10594,8 +10516,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB20_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -10614,17 +10536,17 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 @@ -10646,10 +10568,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 @@ -10657,8 +10579,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9_DPP-NEXT: .LBB20_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 @@ -10681,7 +10603,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -10704,9 +10626,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s5 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -10721,37 +10643,36 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: 
buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i64_varying: @@ -10764,7 +10685,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 @@ -10781,47 +10702,47 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v1, v7 @@ -10830,7 +10751,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10863,10 +10784,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 @@ -10881,45 +10802,44 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 
s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s4 ; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB20_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s3, v8 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8 @@ -10927,7 +10847,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10950,43 +10870,43 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; 
GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4 ; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB20_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v10 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s3, 
v8 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -11001,18 +10921,18 @@ entry: define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: max_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, 1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB21_1 @@ -11021,8 +10941,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11031,8 +10951,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB21_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -11043,27 +10963,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: max_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, 1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11072,8 +10992,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB21_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -11084,27 +11004,27 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: max_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, 1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: 
s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11112,8 +11032,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB21_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -11124,26 +11044,26 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: max_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, 1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; 
GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -11153,9 +11073,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB21_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 @@ -11166,37 +11085,36 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: max_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_brev_b32 s0, 1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; 
GFX1032_ITERATIVE-NEXT: s_brev_b32 s2, 1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_max_i32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB21_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 @@ -11207,45 +11125,43 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: max_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, 1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_max_i32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB21_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -11255,42 +11171,41 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: max_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: 
v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, 1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s2, 1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_max_i32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: 
s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB21_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB21_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -11300,7 +11215,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: max_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11315,14 +11230,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; 
GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11337,10 +11252,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -11349,8 +11264,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB21_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -11364,14 +11279,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; 
GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11386,10 +11301,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -11397,8 +11312,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB21_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -11414,7 +11329,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; 
GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11430,38 +11345,39 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB21_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: max_i32_varying: @@ -11470,7 +11386,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11479,155 +11395,155 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 
exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB21_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: max_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: 
v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB21_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1132_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB21_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -11645,7 +11561,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -11655,8 +11571,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -11679,7 +11595,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -11689,10 +11605,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; 
GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -11713,7 +11629,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -11722,10 +11638,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc @@ -11746,7 +11662,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -11757,9 +11673,8 @@ define amdgpu_kernel void @max_i64_constant(ptr 
addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -11778,7 +11693,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -11789,9 +11704,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -11812,7 +11726,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB22_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -11822,8 +11736,8 @@ define amdgpu_kernel void @max_i64_constant(ptr 
addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -11846,7 +11760,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -11855,8 +11769,8 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -11883,8 +11797,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11892,14 
+11806,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -11915,14 +11829,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB23_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: 
s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -11940,8 +11854,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11950,13 +11864,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -11970,15 +11884,15 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB23_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] @@ -11996,8 +11910,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12006,13 +11920,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 
1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -12026,14 +11940,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB23_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2] @@ -12051,20 +11965,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; 
GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -12078,17 +11992,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB23_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] @@ -12104,20 +12017,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: 
s_brev_b32 s3, 1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -12130,17 +12043,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB23_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; 
GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] @@ -12154,125 +12066,123 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: max_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; 
GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB23_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 
+; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: max_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s3, 1 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, 
s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; 
GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB23_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: max_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -12286,20 +12196,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mov_b32_e32 
v7, v0 -; GFX8_DPP-NEXT: s_mov_b32 s0, 0 +; GFX8_DPP-NEXT: s_mov_b32 s2, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: s_brev_b32 s1, 1 +; GFX8_DPP-NEXT: s_brev_b32 s3, 1 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -12348,15 +12258,15 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s1, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -12365,9 +12275,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; 
GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB23_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 @@ -12385,20 +12295,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: s_mov_b32 s0, 0 +; GFX9_DPP-NEXT: s_mov_b32 s2, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: s_brev_b32 s1, 1 +; GFX9_DPP-NEXT: s_brev_b32 s3, 1 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -12447,15 +12357,15 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s1, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s0, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 ; 
GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -12463,9 +12373,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB23_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 @@ -12483,20 +12393,20 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1064_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1064_DPP-NEXT: s_brev_b32 s3, 1 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1064_DPP-NEXT: 
v_mov_b32_e32 v4, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12547,74 +12457,73 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 
s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB23_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: max_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1032_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032_DPP-NEXT: s_brev_b32 s3, 1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12654,76 +12563,73 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB23_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, 
vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: max_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1164_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s2, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1164_DPP-NEXT: s_brev_b32 s3, 1 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 
v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf @@ -12776,157 +12682,150 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; 
GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB23_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 
s0, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: max_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1132_DPP-NEXT: s_brev_b32 s1, 1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0 +; GFX1132_DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132_DPP-NEXT: s_brev_b32 s3, 1 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 
s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; 
GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, 
v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; 
GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 ; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB23_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: 
v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -12941,18 +12840,18 @@ entry: define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: min_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s4, -2 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB24_1 @@ -12961,8 +12860,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; 
GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -12971,8 +12870,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB24_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -12983,27 +12882,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: min_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_brev_b32 s4, -2 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX8_ITERATIVE-NEXT: ; 
%bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -13012,8 +12911,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB24_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -13024,27 +12923,27 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: min_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_brev_b32 s4, -2 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], 
s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -13052,8 +12951,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB24_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -13064,26 +12963,26 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: min_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_brev_b32 s4, -2 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: 
v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -13093,9 +12992,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB24_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 @@ -13106,37 +13004,36 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: min_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: 
s_brev_b32 s0, -2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s2, -2 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_min_i32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB24_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: 
s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 @@ -13147,45 +13044,43 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: min_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s4, -2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_min_i32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB24_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -13195,42 +13090,41 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: min_i32_varying: ; 
GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_brev_b32 s0, -2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s2, -2 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB24_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_min_i32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; 
GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB24_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB24_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -13240,7 +13134,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: min_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -13255,14 +13149,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: 
s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13277,10 +13171,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -13289,8 +13183,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB24_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -13304,14 +13198,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; 
GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13326,10 +13220,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -13337,8 +13231,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB24_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -13354,7 +13248,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; 
GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13370,38 +13264,39 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: 
.LBB24_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: min_i32_varying: @@ -13410,7 +13305,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13419,155 +13314,155 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB24_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: min_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; 
GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; 
GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB24_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1132_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB24_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -13585,7 +13480,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB25_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -13595,8 +13490,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB25_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -13619,7 +13514,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB25_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 
@@ -13629,10 +13524,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB25_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -13653,7 +13548,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB25_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -13662,10 +13557,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB25_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -13686,7 +13581,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB25_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: 
v_mov_b32_e32 v0, 5 @@ -13697,9 +13592,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB25_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -13718,7 +13612,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB25_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -13729,9 +13623,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB25_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -13752,7 +13645,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB25_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: 
v_mov_b32_e32 v0, 5 @@ -13762,8 +13655,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB25_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -13786,7 +13679,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB25_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -13795,8 +13688,8 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB25_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -13823,8 +13716,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX7LESS_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB26_1: ; 
%ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13832,14 +13725,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -13855,14 +13748,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB26_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -13880,8 +13773,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX8_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX8_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13890,13 +13783,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -13910,15 +13803,15 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB26_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] @@ -13936,8 +13829,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX9_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX9_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13946,13 +13839,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 
+; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -13966,14 +13859,14 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB26_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2] @@ -13991,20 +13884,20 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1064_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: 
v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -14018,17 +13911,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB26_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] @@ -14044,20 +13936,20 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; 
GFX1032_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -14070,17 +13962,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB26_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; 
GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] @@ -14094,125 +13985,123 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: min_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; 
GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB26_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; 
GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: min_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_brev_b32 s3, -2 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB26_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 
s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 -; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB26_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: min_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -14228,18 +14117,18 @@ define amdgpu_kernel void @min_i64_varying(ptr 
addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_brev_b32 s7, -2 +; GFX8_DPP-NEXT: s_brev_b32 s3, -2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -14288,36 +14177,36 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; 
GFX8_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 ; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB26_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: min_i64_varying: @@ -14325,18 +14214,18 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_brev_b32 s7, -2 +; GFX9_DPP-NEXT: s_brev_b32 s3, -2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s3 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -14385,54 +14274,54 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6] ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s7, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s6 ; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB26_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 -; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 +; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: min_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_brev_b32 s3, -2 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s3 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14474,82 +14363,82 @@ define amdgpu_kernel void 
@min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s6 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1064_DPP-NEXT: v_writelane_b32 
v1, s6, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s10, v4, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1064_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 
vcc, s[0:1], v[9:10] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_brev_b32 s3, -2 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s3 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s3, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14589,75 +14478,72 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: 
v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s3 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s3, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s3 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB26_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; 
GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: min_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1164_DPP-NEXT: s_brev_b32 s3, -2 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s3 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7 +; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf @@ -14698,168 +14584,162 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 
s3 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s6 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 63 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s3, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s3, v3, 31 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v3, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s10, v4, 47 ; GFX1164_DPP-NEXT: v_readlane_b32 s11, v3, 47 -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 
v1, s8, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 32 +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s3, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB26_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1164_DPP-NEXT: 
v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7 -; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0 +; GFX1132_DPP-NEXT: s_brev_b32 s3, -2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: 
v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s3, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf 
bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s3 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s3, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; 
GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s3 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 ; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB26_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -14874,18 +14754,18 @@ entry: 
define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umax_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB27_1 @@ -14894,8 +14774,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14904,8 +14784,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB27_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -14916,27 +14796,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: umax_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz 
.LBB27_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14945,8 +14825,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB27_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -14957,27 +14837,27 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: umax_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -14985,8 +14865,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB27_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -14997,26 +14877,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: umax_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1064_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; 
%ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -15026,9 +14906,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB27_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 @@ -15039,37 +14918,36 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: umax_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; 
GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_max_u32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB27_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 @@ -15080,45 +14958,43 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umax_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 
0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_max_u32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB27_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -15128,42 +15004,41 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umax_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB27_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_max_u32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB27_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: 
ds_max_rtn_u32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB27_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -15173,7 +15048,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umax_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -15193,7 +15068,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_nop 0 @@ -15209,10 +15084,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; 
GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 @@ -15220,8 +15095,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB27_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -15240,7 +15115,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_nop 0 @@ -15256,18 +15131,18 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9_DPP-NEXT: .LBB27_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -15283,7 +15158,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15299,38 +15174,39 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB27_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umax_i32_varying: @@ -15339,7 +15215,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -15348,154 +15224,155 @@ define 
amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB27_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; 
GFX1032_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umax_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 
s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB27_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; 
GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB27_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -15513,7 +15390,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB28_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -15523,8 +15400,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB28_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -15546,7 +15423,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -15556,8 +15433,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -15579,7 +15456,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -15588,8 +15465,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB28_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 
; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -15611,7 +15488,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB28_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -15622,9 +15499,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB28_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -15643,7 +15519,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB28_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -15654,9 +15530,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB28_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ 
-15677,7 +15552,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB28_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -15687,8 +15562,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB28_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -15711,7 +15586,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB28_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -15720,8 +15595,8 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB28_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -15748,7 +15623,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15756,14 +15631,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -15779,14 +15654,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: 
ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB29_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -15804,7 +15679,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15813,13 +15688,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -15833,15 +15708,15 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) 
%out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB29_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] @@ -15859,7 +15734,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15868,13 +15743,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; 
GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -15888,14 +15763,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB29_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2] @@ -15913,19 +15788,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: 
v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -15939,17 +15814,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB29_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] @@ -15965,19 +15839,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -15990,17 +15864,16 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB29_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] @@ -16014,123 +15887,121 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umax_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1164_ITERATIVE-NEXT: 
s_cselect_b32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB29_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1164_ITERATIVE-NEXT: 
v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umax_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], 0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; 
GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB29_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: umax_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -16147,17 +16018,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -16210,10 +16081,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -16222,9 +16093,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB29_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 @@ -16245,17 +16116,17 @@ define amdgpu_kernel 
void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -16308,10 +16179,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -16319,9 +16190,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB29_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: 
v_readfirstlane_b32 s5, v10 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 @@ -16339,10 +16210,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: umax_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 @@ -16350,7 +16221,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16401,72 +16272,71 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; 
GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -16506,74 +16376,72 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; 
GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 
+; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umax_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf @@ -16626,155 +16494,149 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: 
v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB29_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umax_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: 
v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: 
v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; 
GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 ; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB29_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -16789,18 +16651,18 @@ entry: define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-LABEL: umin_i32_varying: ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX7LESS_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_vccnz .LBB30_1 @@ -16809,8 +16671,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; 
GFX7LESS_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX7LESS_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -16819,8 +16681,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB30_4: -; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 @@ -16831,27 +16693,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX8_ITERATIVE-LABEL: umin_i32_varying: ; GFX8_ITERATIVE: ; %bb.0: ; %entry -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX8_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX8_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 -; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], 
s[6:7] -; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -16860,8 +16722,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB30_4: -; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -16872,27 +16734,27 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX9_ITERATIVE-LABEL: umin_i32_varying: ; GFX9_ITERATIVE: ; %bb.0: ; %entry -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX9_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX9_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s5 ; 
GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 -; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -16900,8 +16762,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB30_4: -; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1 @@ -16912,26 +16774,26 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1064_ITERATIVE-LABEL: umin_i32_varying: ; GFX1064_ITERATIVE: ; %bb.0: ; %entry -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s4, -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1 
; GFX1064_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 -; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 -; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 @@ -16941,9 +16803,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB30_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 @@ -16954,37 
+16815,36 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1032_ITERATIVE-LABEL: umin_i32_varying: ; GFX1032_ITERATIVE: ; %bb.0: ; %entry -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b32 s0, -1 +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1032_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s1 +; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s4 -; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6 -; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 -; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s3, s3, s6 +; GFX1032_ITERATIVE-NEXT: s_min_u32 s2, s2, s5 +; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr0 -; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB30_4: ; 
GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 @@ -16995,45 +16855,43 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umin_i32_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s4, -1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s5 +; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s5 -; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s4, s5 +; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_min_u32 s4, s4, s8 -; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164_ITERATIVE-NEXT: 
s_cbranch_scc1 .LBB30_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB30_4: -; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -17043,42 +16901,41 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umin_i32_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s1, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, exec_lo +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 ; GFX1132_ITERATIVE-NEXT: .LBB30_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s1 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s4 +; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s4 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s4 -; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s5 -; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX1132_ITERATIVE-NEXT: s_min_u32 s2, s2, s5 +; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; 
GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1132_ITERATIVE-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0 +; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB30_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v1, v1, v2 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB30_4: -; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -17088,7 +16945,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; ; GFX7LESS_DPP-LABEL: umin_i32_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -17103,14 +16960,14 @@ 
define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17125,10 +16982,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8_DPP-NEXT: s_nop 0 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -17137,8 +16994,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB30_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -17152,14 +17009,14 
@@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17174,10 +17031,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9_DPP-NEXT: s_nop 0 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, 0 @@ -17185,8 +17042,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB30_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, v1 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -17202,7 +17059,7 
@@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_not_b64 exec, exec ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17218,38 +17075,39 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064_DPP-NEXT: 
v_mov_b32_e32 v4, s7 +; GFX1064_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB30_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1064_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umin_i32_varying: @@ -17258,7 +17116,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf @@ -17267,155 +17125,155 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp 
v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1032_DPP-NEXT: ; %bb.1: ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB30_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1032_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umin_i32_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164_DPP-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164_DPP-NEXT: s_mov_b32 s3, s7 ; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB30_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[0:1] 
-; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1164_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i32_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, s4 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 -; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 ; GFX1132_DPP-NEXT: ; %bb.1: ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, s3 ; GFX1132_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB30_2: -; GFX1132_DPP-NEXT: s_or_b32 
exec_lo, exec_lo, s1 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX1132_DPP-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -17433,7 +17291,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB31_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -17443,8 +17301,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB31_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 @@ -17466,7 +17324,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: 
$vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB31_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 @@ -17476,8 +17334,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB31_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -17499,7 +17357,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB31_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 @@ -17508,8 +17366,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB31_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -17531,7 +17389,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB31_2 ; 
GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 @@ -17542,9 +17400,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB31_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_mov_b32 null, 0 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -17563,7 +17420,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB31_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 @@ -17574,9 +17431,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB31_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_mov_b32 null, 0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -17597,7 +17453,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB31_2 ; GFX1164-NEXT: 
; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v0, 5 @@ -17607,8 +17463,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB31_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -17631,7 +17487,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB31_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 @@ -17640,8 +17496,8 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB31_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -17668,7 +17524,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE: ; %bb.0: ; %entry ; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX7LESS_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX7LESS_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX7LESS_ITERATIVE-NEXT: ; =>This Inner Loop Header: 
Depth=1 @@ -17676,14 +17532,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s9, v3, s8 ; GFX7LESS_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX7LESS_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5] ; GFX7LESS_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX7LESS_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX7LESS_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX7LESS_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX7LESS_ITERATIVE-NEXT: v_cmp_ne_u64_e64 s[6:7], s[4:5], 0 @@ -17699,14 +17555,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX7LESS_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX7LESS_ITERATIVE-NEXT: ; %bb.3: ; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 -; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS_ITERATIVE-NEXT: .LBB32_4: ; GFX7LESS_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX7LESS_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 @@ -17724,7 +17580,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE: ; %bb.0: ; %entry ; GFX8_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX8_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX8_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX8_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17733,13 +17589,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5] ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX8_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX8_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -17753,15 +17609,15 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_ITERATIVE-NEXT: 
v_mov_b32_e32 v3, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: .LBB32_4: ; GFX8_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] @@ -17779,7 +17635,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE: ; %bb.0: ; %entry ; GFX9_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 -; GFX9_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX9_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX9_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17788,13 +17644,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: v_readlane_b32 s10, v0, s8 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s10 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v5, s9 -; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5] +; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[4:5] ; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8 ; GFX9_ITERATIVE-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0 -; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s9 -; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s3, m0 +; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s9 +; GFX9_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s10 ; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8 ; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ 
-17808,14 +17664,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: .LBB32_4: ; GFX9_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3 ; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2] @@ -17833,19 +17689,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE: ; %bb.0: ; %entry ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1064_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[4:5] ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 ; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10 -; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10 -; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 +; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[2:3], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; 
GFX1064_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -17859,17 +17715,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1064_ITERATIVE-NEXT: .LBB32_4: ; GFX1064_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] @@ -17885,19 +17740,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE: ; %bb.0: ; %entry ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 +; GFX1032_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 ; GFX1032_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: 
s_ff1_i32_b32 s5, s4 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 ; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s5 -; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s5 -; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 +; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[6:7] ; GFX1032_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s4, s4, s5 ; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 @@ -17910,17 +17765,16 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1032_ITERATIVE-NEXT: .LBB32_4: ; GFX1032_ITERATIVE-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032_ITERATIVE-NEXT: s_mov_b32 null, 0 +; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 ; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] @@ -17934,123 +17788,121 @@ 
define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umin_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1164_ITERATIVE-NEXT: .p2align 6 ; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 -; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s10 +; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[4:5], s[4:5], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; 
GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1164_ITERATIVE-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s2 +; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1164_ITERATIVE-NEXT: .LBB32_4: ; GFX1164_ITERATIVE-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX1164_ITERATIVE-NEXT: 
v_cndmask_b32_e64 v1, v1, s3, vcc -; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc +; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2] +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc +; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1164_ITERATIVE-NEXT: s_nop 0 ; GFX1164_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_ITERATIVE-NEXT: s_endpgm ; ; GFX1132_ITERATIVE-LABEL: umin_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[2:3], -1 +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1132_ITERATIVE-NEXT: .p2align 6 ; GFX1132_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s5 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s5 -; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s5 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s3, s5 +; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7] +; GFX1132_ITERATIVE-NEXT: 
v_cmp_lt_u64_e64 s8, s[2:3], s[6:7] ; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 -; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s3, s3, s7 +; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s2, s2, s6 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s5, 1, s5 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s4, s4, s5 ; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1132_ITERATIVE-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_ITERATIVE-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132_ITERATIVE-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s4, exec_lo, s4 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 -; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, s3 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv ; GFX1132_ITERATIVE-NEXT: .LBB32_4: ; GFX1132_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; 
GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4 +; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2] +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo +; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1 ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX1132_ITERATIVE-NEXT: s_nop 0 ; GFX1132_ITERATIVE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_ITERATIVE-NEXT: s_endpgm ; ; GFX7LESS_DPP-LABEL: umin_i64_varying: ; GFX7LESS_DPP: ; %bb.0: ; %entry -; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS_DPP-NEXT: s_mov_b32 m0, -1 ; GFX7LESS_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -18067,17 +17919,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 
v8 ; GFX8_DPP-NEXT: s_not_b64 exec, exec ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX8_DPP-NEXT: s_not_b64 exec, exec -; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX8_DPP-NEXT: s_nop 0 @@ -18130,10 +17982,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX8_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -18142,9 +17994,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB32_2: -; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 @@ -18165,17 +18017,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8 ; GFX9_DPP-NEXT: s_not_b64 exec, exec ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX9_DPP-NEXT: s_not_b64 exec, exec -; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX9_DPP-NEXT: s_nop 0 @@ -18228,10 +18080,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 @@ -18239,9 +18091,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB32_2: -; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 @@ -18259,10 +18111,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-LABEL: umin_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 
s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10 @@ -18270,7 +18122,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1064_DPP-NEXT: s_not_b64 exec, exec -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18321,72 +18173,71 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1064_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1064_DPP-NEXT: 
v_readlane_b32 s4, v3, 63 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1064_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB32_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: s_mov_b32 s7, 
0x31016000 -; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -18426,74 +18277,72 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1032_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; 
GFX1032_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032_DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; 
GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[0:3], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: umin_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v9 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1 ; GFX1164_DPP-NEXT: s_not_b64 exec, exec -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf @@ -18546,155 +18395,149 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 15 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 15 ; GFX1164_DPP-NEXT: v_readlane_b32 s6, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v3, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v4, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s4, 16 -; 
GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v3, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v4, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 63 ; GFX1164_DPP-NEXT: v_readlane_b32 s9, v3, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s6, 32 -; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1164_DPP-NEXT: s_mov_b64 s[6:7], exec ; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s4 ; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB32_2: -; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1164_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 
s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[9:10] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s4, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: umin_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v9 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v10 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX1132_DPP-NEXT: 
v_mov_b32_e32 v4, -1 ; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: 
v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3 +; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_readlane_b32 s3, v4, 15 +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 31 +; GFX1132_DPP-NEXT: v_readlane_b32 s4, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132_DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132_DPP-NEXT: v_writelane_b32 v2, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 -; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1132_DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 ; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB32_2: -; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132_DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s1, vcc_lo -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v9, v9, s0, vcc_lo +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v10, v10, s5, vcc_lo +; GFX1132_DPP-NEXT: 
v_cndmask_b32_e64 v9, v9, s4, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[9:10], off, s[0:3], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index f636fa5d83a57a..6be4b893c14a40 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -23,18 +23,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -51,18 +51,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 
s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -79,18 +79,18 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -106,10 +106,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -117,10 +117,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -130,25 +129,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -159,7 +157,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -167,7 +165,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -175,8 +173,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -190,24 +188,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: 
; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -222,7 +220,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -230,7 +228,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; 
%bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -238,8 +236,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -253,24 +251,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null 
th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -291,23 +289,23 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -320,24 +318,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: 
$vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -350,24 +348,24 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,16 +377,16 @@ define 
amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -396,10 +394,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -409,37 +406,37 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: 
s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -447,7 +444,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -455,8 +452,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -470,41 +467,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -512,7 +509,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -520,8 +517,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -535,32 +532,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 
s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -573,18 +570,18 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 @@ -593,17 +590,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB2_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -616,36 +613,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: 
s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -657,36 +654,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: 
v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -697,38 +694,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; 
GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -738,37 +734,36 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: 
v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -778,182 +773,174 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; 
GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, 
off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: 
s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 
s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: 
s_load_b128 s[8:11], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -967,8 +954,8 @@ entry: define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc @@ -980,9 +967,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -993,9 +980,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1005,10 +992,9 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1016,67 +1002,33 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: add_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, 
s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX12W64-LABEL: add_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX12-LABEL: add_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) @@ -1092,18 +1044,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB4_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB4_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1121,18 +1073,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 
exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1150,18 +1102,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1178,10 +1130,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 @@ -1189,10 +1141,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB4_2: 
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1203,25 +1154,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1233,7 +1183,7 @@ define amdgpu_kernel void 
@sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1241,7 +1191,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1249,8 +1199,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB4_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1265,24 +1215,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: 
v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB4_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1298,7 +1248,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1306,7 +1256,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1314,8 +1264,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr 
addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB4_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1330,24 +1280,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB4_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 
v0, 5, v0 @@ -1369,23 +1319,23 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1398,24 +1348,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1428,24 +1378,24 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1457,16 +1407,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1474,8 +1424,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1487,38 +1437,38 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: 
buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1526,7 +1476,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1534,8 +1484,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1550,42 +1500,42 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; 
GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1593,7 +1543,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_mul_i32 s4, s6, s4 @@ -1601,8 +1551,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1617,33 +1567,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1656,18 +1606,18 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB6_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; 
GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB6_1 @@ -1676,17 +1626,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB6_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1699,36 +1649,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; 
GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB6_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1740,36 +1690,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], 
s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB6_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1780,38 +1730,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1821,37 +1770,36 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; 
GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB6_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1861,184 +1809,176 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: 
s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_4: -; GFX11W64-NEXT: 
s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; 
GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB6_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: 
s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 
+; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2052,8 +1992,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc @@ -2065,9 +2005,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2078,9 +2018,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2090,10 +2030,9 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2101,73 +2040,36 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_offset: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ; -; GFX11W32-LABEL: sub_i32_varying_offset: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc -; 
GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_offset: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_offset: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX12-LABEL: sub_i32_varying_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0) store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 3e8565d34c6beb..419d8bc5cc747f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -23,10 +23,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -34,8 +34,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -52,10 +52,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; 
implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -63,8 +63,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 @@ -81,10 +81,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -92,8 +92,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -109,10 +109,10 @@ 
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -121,10 +121,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -134,26 +133,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; 
GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -164,7 +162,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -172,7 +170,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -181,8 +179,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 @@ -196,25 +194,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 @@ -229,7 +227,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: add_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; 
GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -237,7 +235,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -246,8 +244,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB0_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 @@ -261,24 +259,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: add_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 
s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB0_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 @@ -299,15 +297,15 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -315,8 +313,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 
0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -329,16 +327,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -346,8 +344,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -360,16 +358,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: 
s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -377,8 +375,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -390,16 +388,16 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -408,10 +406,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] @@ -421,38 +418,38 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[4:5] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: 
s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -460,7 +457,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -469,8 +466,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -484,42 +481,42 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 
s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -527,7 +524,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; 
GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -536,8 +533,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB1_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -551,32 +548,32 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: add_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB1_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: 
v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -589,18 +586,18 @@ entry: define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1 @@ -609,18 +606,18 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB2_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 
; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB2_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -633,37 +630,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 @@ -675,37 +672,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB2_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, 
s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -716,39 +713,38 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_4: ; GFX10W64-NEXT: s_waitcnt_depctr 
0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -758,38 +754,37 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 
s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB2_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 @@ -799,184 +794,178 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 
v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; 
GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11W32-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB2_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: add_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB2_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: add_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, 
exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB2_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -990,8 +979,8 @@ entry: define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: add_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen 
glc @@ -1003,9 +992,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1016,9 +1005,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1028,10 +1017,9 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: add_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1039,67 +1027,33 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: add_i32_varying_vindex: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: add_i32_varying_vindex: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: add_i32_varying_vindex: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: add_i32_varying_vindex: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; 
GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: add_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: add_i32_varying_vindex: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -1111,10 +1065,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: add_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; 
GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc @@ -1126,14 +1080,15 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: add_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1142,27 +1097,27 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: add_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: add_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -1172,12 +1127,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: add_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 -; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -1189,12 +1144,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: add_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -1206,12 +1161,11 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: add_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1222,11 +1176,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: add_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -1249,10 +1202,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz 
.LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_mul_i32 s4, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -1260,8 +1213,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1279,10 +1232,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 @@ -1290,8 +1243,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1309,10 +1262,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 @@ -1320,8 +1273,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1338,10 +1291,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1350,10 +1303,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 @@ -1364,26 +1316,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 @@ -1395,7 +1346,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 
s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1403,7 +1354,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1412,8 +1363,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1428,25 +1379,25 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: 
s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s1, s1, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1462,7 +1413,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-LABEL: sub_i32_constant: ; GFX12W64: ; %bb.0: ; %entry ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1470,7 +1421,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_mul_i32 s4, s4, 5 @@ -1479,8 +1430,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB5_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1495,24 +1446,24 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12W32-LABEL: sub_i32_constant: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, exec_lo -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, exec_lo +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: s_mul_i32 s1, s1, 5 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB5_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1534,15 +1485,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_load_dword s6, s[2:3], 0x11 +; 
GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mul_i32 s4, s6, s4 @@ -1550,8 +1501,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1564,16 +1515,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s4, s6, s4 @@ -1581,8 +1532,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1595,16 +1546,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s4, s6, s4 @@ -1612,8 +1563,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1625,16 +1576,16 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dword s6, s[2:3], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 
s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,8 +1594,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) @@ -1656,39 +1607,39 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s4, s0, 
s4 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1696,7 +1647,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,8 +1656,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, 
s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) @@ -1721,43 +1672,43 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 ; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_uniform: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44 +; GFX12W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX12W64-NEXT: s_mov_b64 s[4:5], exec -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1765,7 +1716,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W64-NEXT: ; %bb.1: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 @@ -1774,8 +1725,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB6_2: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -1790,33 +1741,33 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12W32-LABEL: sub_i32_uniform: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44 +; GFX12W32-NEXT: s_load_b32 s2, s[0:1], 0x44 
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo ; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX12W32-NEXT: ; %bb.1: -; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: s_mul_i32 s4, s0, s4 +; GFX12W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB6_2: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -1829,18 +1780,18 @@ entry: define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vdata: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: ; implicit-def: $vgpr1 
; GFX6-NEXT: .LBB7_1: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_mov_b32 m0, s5 ; GFX6-NEXT: v_readlane_b32 s8, v0, s5 ; GFX6-NEXT: v_writelane_b32 v1, s4, m0 ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1 @@ -1849,18 +1800,18 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB7_4 ; GFX6-NEXT: ; %bb.3: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB7_4: -; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1873,37 +1824,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop 
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB7_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_4: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 @@ -1915,37 +1866,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX9-NEXT: 
s_ff1_i32_b64 s5, s[2:3] ; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1956,39 +1907,38 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_mov_b64 s[0:1], exec +; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; 
GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 -; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX10W64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX10W64-NEXT: s_add_i32 s4, s4, s8 -; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX10W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W64-NEXT: ; %bb.3: -; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_4: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: s_mov_b32 null, 0 ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -1998,38 +1948,37 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10W32-NEXT: s_mov_b32 s0, 0 +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo +; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 ; GFX10W32-NEXT: .LBB7_1: ; %ComputeLoop 
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W32-NEXT: s_ff1_i32_b32 s4, s1 +; GFX10W32-NEXT: s_ff1_i32_b32 s4, s3 ; GFX10W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX10W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX10W32-NEXT: v_writelane_b32 v1, s0, s4 -; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6 -; GFX10W32-NEXT: s_add_i32 s0, s0, s5 -; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX10W32-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10W32-NEXT: s_add_i32 s2, s2, s5 +; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX10W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10W32-NEXT: ; %bb.3: -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB7_4: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: s_mov_b32 null, 0 ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 @@ -2039,186 +1988,180 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: 
v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: s_mov_b64 s[0:1], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX11W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 -; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX11W64-NEXT: ; implicit-def: $vgpr1 -; GFX11W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 -; 
GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_4: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W32-NEXT: s_mov_b32 s1, exec_lo -; GFX11W32-NEXT: s_mov_b32 s0, 0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W32-NEXT: s_ctz_i32_b32 s4, s1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX11W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX11W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX11W32-NEXT: v_writelane_b32 v0, s0, 
s4 -; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: s_add_i32 s0, s0, s5 -; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX11W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11W32-NEXT: s_add_i32 s2, s2, s5 +; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11W32-NEXT: ; implicit-def: $vgpr1 -; GFX11W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX11W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB7_4: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: 
v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm ; ; GFX12W64-LABEL: sub_i32_varying_vdata: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W64-NEXT: s_mov_b64 s[0:1], exec +; GFX12W64-NEXT: s_mov_b64 s[2:3], exec ; GFX12W64-NEXT: s_mov_b32 s4, 0 -; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5 -; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 +; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 -; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 -; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX12W64-NEXT: ; implicit-def: $vgpr1 -; GFX12W64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX12W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
+; GFX12W64-NEXT: ; implicit-def: $vgpr0 +; GFX12W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX12W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX12W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W64-NEXT: ; %bb.3: -; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34 +; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX12W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN +; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: .LBB7_4: -; GFX12W64-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W64-NEXT: s_endpgm ; ; GFX12W32-LABEL: sub_i32_varying_vdata: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX12W32-NEXT: s_mov_b32 s1, exec_lo -; GFX12W32-NEXT: s_mov_b32 s0, 0 -; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_mov_b32 s3, exec_lo +; GFX12W32-NEXT: s_mov_b32 s2, 0 +; GFX12W32-NEXT: ; implicit-def: $vgpr1 ; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W32-NEXT: 
s_ctz_i32_b32 s4, s1 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4 +; GFX12W32-NEXT: s_ctz_i32_b32 s4, s3 +; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12W32-NEXT: v_readlane_b32 s5, v0, s4 ; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4 -; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4 -; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6 -; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5 -; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12W32-NEXT: v_writelane_b32 v1, s2, s4 +; GFX12W32-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX12W32-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX12W32-NEXT: ; implicit-def: $vgpr1 -; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX12W32-NEXT: ; implicit-def: $vgpr0 +; GFX12W32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX12W32-NEXT: ; %bb.3: -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: .LBB7_4: -; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12W32-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 -; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12W32-NEXT: s_endpgm @@ -2232,8 +2175,8 @@ entry: define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr addrspace(8) %inout) { ; GFX6-LABEL: sub_i32_varying_vindex: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v1, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc @@ -2245,9 +2188,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_vindex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2258,9 +2201,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_vindex: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2270,10 +2213,9 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: sub_i32_varying_vindex: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2281,67 +2223,33 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11W64-LABEL: sub_i32_varying_vindex: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W64-NEXT: s_nop 0 -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W64-NEXT: s_endpgm -; -; GFX11W32-LABEL: sub_i32_varying_vindex: -; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: 
buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11W32-NEXT: s_nop 0 -; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11W32-NEXT: s_endpgm -; -; GFX12W64-LABEL: sub_i32_varying_vindex: -; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX12W64-NEXT: s_wait_kmcnt 0x0 -; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W64-NEXT: s_wait_loadcnt 0x0 -; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W64-NEXT: s_nop 0 -; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W64-NEXT: s_endpgm -; -; GFX12W32-LABEL: sub_i32_varying_vindex: -; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12W32-NEXT: s_wait_kmcnt 0x0 -; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN -; GFX12W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX12W32-NEXT: s_wait_loadcnt 0x0 -; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12W32-NEXT: s_nop 0 -; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12W32-NEXT: s_endpgm +; GFX11-LABEL: sub_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: 
s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: sub_i32_varying_vindex: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -2353,10 +2261,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; GFX6-LABEL: sub_i32_varying_offset: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, 1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc @@ -2368,14 +2276,15 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: sub_i32_varying_offset: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2384,27 +2293,27 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: sub_i32_varying_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sub_i32_varying_offset: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -2414,12 +2323,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W64-LABEL: sub_i32_varying_offset: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W64-NEXT: s_mov_b32 s0, 0 -; 
GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX11W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 @@ -2431,12 +2340,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX11W32-LABEL: sub_i32_varying_offset: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11W32-NEXT: s_mov_b32 s0, 0 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 -; GFX11W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 @@ -2448,12 +2357,11 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W64-LABEL: sub_i32_varying_offset: ; GFX12W64: ; %bb.0: ; %entry -; GFX12W64-NEXT: s_clause 0x1 -; GFX12W64-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W64-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W64-NEXT: v_mov_b32_e32 v1, v0 ; GFX12W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: 
buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W64-NEXT: s_wait_loadcnt 0x0 @@ -2464,11 +2372,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add ; ; GFX12W32-LABEL: sub_i32_varying_offset: ; GFX12W32: ; %bb.0: ; %entry -; GFX12W32-NEXT: s_clause 0x1 -; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 +; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 ; GFX12W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN ; GFX12W32-NEXT: s_wait_loadcnt 0x0 @@ -2482,6 +2389,3 @@ entry: store i32 %old, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11: {{.*}} -; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 417d38990505b6..2c69ae58f0e611 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -33,7 +33,7 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -42,7 +42,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; 
GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 @@ -58,8 +58,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 @@ -72,8 +72,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 @@ -92,7 +92,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; 
GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN @@ -114,7 +114,7 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 @@ -124,7 +124,7 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 @@ -140,11 +140,11 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v1, s6 ; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] @@ -155,8 +155,8 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr ; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN @@ -175,7 +175,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -185,7 +185,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -201,7 +201,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -211,7 +211,7 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; ; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -227,7 +227,7 @@ entry: define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { ; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 @@ -238,7 +238,7 @@ define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ; ; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c0..7da058ca6ee7e7 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. 
; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 20 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 90562e25a3e9c1..b2f01660201d7e 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -1,6 +1,6 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O2 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s ; Check that AMDGPUAttributor is not run with -O0. 
; OPT: .amdhsa_user_sgpr_private_segment_buffer 1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index c6f51b785e30e2..17eb54a81d827f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3781,21 +3781,21 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v2, s30, 0 ; GCN-NEXT: v_writelane_b32 v2, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen @@ -3806,27 +3806,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; 
%entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v2, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 @@ -3837,27 +3837,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; 
GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -3866,27 +3866,27 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, 
test_arg_store@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3895,28 +3895,28 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: buffer_store_short v0, 
v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3926,7 +3926,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3968,21 +3968,21 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v4, s30, 0 ; GCN-NEXT: v_writelane_b32 v4, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 @@ -3998,27 +3998,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 
-; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4034,27 +4034,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; 
GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4063,27 +4063,27 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v2, 
s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4092,28 +4092,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, 
s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4123,7 +4123,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4165,21 +4165,21 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v5, s30, 0 ; GCN-NEXT: v_writelane_b32 v5, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: 
s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4197,27 +4197,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4235,27 +4235,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; 
GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen @@ -4267,27 +4267,27 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, 
s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4298,28 +4298,28 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: 
v_writelane_b32 v3, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4331,7 +4331,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4375,21 +4375,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v8, s30, 0 ; GCN-NEXT: v_writelane_b32 v8, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, 
s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4415,27 +4415,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v6, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v6, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -4461,27 +4461,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v4, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 v4, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -4493,27 +4493,27 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v4, 
off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4524,28 +4524,28 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4557,7 +4557,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,21 +4599,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: 
buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v16, s30, 0 ; GCN-NEXT: v_writelane_b32 v16, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4659,27 +4659,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v10, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v10, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, 
test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -4725,27 +4725,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v6, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 v6, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 ; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4763,27 +4763,27 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 ; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4798,28 +4798,28 @@ define void 
@test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4835,7 +4835,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -4877,21 +4877,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_writelane_b32 v21, s30, 0 ; GCN-NEXT: v_writelane_b32 v21, s31, 1 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -4977,27 +4977,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s18, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 -; 
GFX7-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[16:17] +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v18, s30, 0 ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v18, s31, 1 -; GFX7-NEXT: s_getpc_b64 s[16:17] -; GFX7-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 @@ -5083,27 +5083,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s18 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: v_writelane_b32 v10, s30, 0 ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: v_writelane_b32 
v10, s31, 1 -; GFX8-NEXT: s_getpc_b64 s[16:17] -; GFX8-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 ; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5133,27 +5133,27 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, 
test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 ; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5176,28 +5176,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[16:17] -; GFX10-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GFX10-NEXT: v_readlane_b32 s30, v9, 0 ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5221,7 +5221,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -27297,7 +27297,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_and_b32 s4, s6, 0x80000000 +; GCN-NEXT: s_and_b32 s4, s4, 0x80000000 ; GCN-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27308,7 +27308,7 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_and_b32 s4, s6, 0x80000000 +; GFX7-NEXT: s_and_b32 s4, s4, 0x80000000 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 @@ -27318,23 +27318,23 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX8-LABEL: v_copysign_bf16_s_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_bf16_s_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_movk_i32 s5, 
0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_bf16_s_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s6 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_bf16_s_bf16: @@ -27350,7 +27350,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27361,7 +27361,7 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 @@ -27372,23 +27372,23 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX8-LABEL: v_copysign_s_bf16_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_s_bf16_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX9-NEXT: s_movk_i32 s5, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_s_bf16_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_s_bf16_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll index 2c179de2a9c35c..0f20ed1320dad7 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll @@ -6,10 +6,10 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -24,11 +24,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; VI-SDWA-LABEL: bfe_combine8: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -42,13 +42,13 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) ; ; CI-LABEL: bfe_combine8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0xb -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 6, v0 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: v_and_b32_e32 v0, 0x3fc, v0 @@ -71,11 +71,11 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) { ; VI-LABEL: bfe_combine16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 16 ; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -91,11 +91,11 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; VI-SDWA-LABEL: bfe_combine16: ; VI-SDWA: ; %bb.0: -; VI-SDWA-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15 ; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0 ; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -111,13 +111,13 @@ define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x ; ; CI-LABEL: bfe_combine16: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dword s4, s[2:3], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index f54ea615ca6645..af4116bd6aae5d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr 
addrspace(1) %out, p ; ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -115,7 +115,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define 
amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -221,7 +221,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p ; ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -313,7 +313,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -329,7 +329,7 @@ define amdgpu_kernel 
void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 { ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -373,7 +373,7 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -417,8 +417,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -444,8 +444,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -462,8 +462,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou ; ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -491,8 +491,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) { ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -509,8 +509,8 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ; ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32: ; VI: ; %bb.0: 
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[6:7], 0x0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 78d764898a3b93..7b8eaccaa4142b 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -11,50 +11,52 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_def_i32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s6, s6, s4 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_andn2_b32 s4, s8, s6 +; GFX7-NEXT: s_and_b32 s5, s7, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_def_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s2, s6, s4 -; GFX8-NEXT: s_and_b32 s3, s5, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_and_b32 s1, s7, s6 +; GFX8-NEXT: s_andn2_b32 s0, s0, s6 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_def_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s2, s6, s4 -; GFX10-NEXT: s_and_b32 s3, s5, s4 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_and_b32 s1, s7, s6 +; GFX10-NEXT: s_andn2_b32 s0, s0, s6 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_def_i32: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6 ; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6 @@ -68,8 +70,8 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX10-GISEL-LABEL: s_bfi_def_i32: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6 @@ -130,50 +132,52 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ch: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: s_xor_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_xor_b32 s4, s7, s8 +; GFX7-NEXT: s_and_b32 s4, s6, s4 +; GFX7-NEXT: s_xor_b32 s4, s8, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s2, s5, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s6, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b32 s1, s7, s0 +; GFX8-NEXT: s_and_b32 s1, s6, s1 +; GFX8-NEXT: s_xor_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s2, s5, s6 -; GFX10-NEXT: s_and_b32 s2, s4, s2 -; GFX10-NEXT: s_xor_b32 s2, s6, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, 
s[0:1] +; GFX10-NEXT: s_xor_b32 s1, s7, s0 +; GFX10-NEXT: s_and_b32 s1, s6, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -187,8 +191,8 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0 @@ -454,53 +458,55 @@ entry: define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ma: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_or_b32 s4, s4, s6 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_or_b32 s5, s6, s8 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s5, s7, s5 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 
v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s3, s4, s6 -; GFX8-NEXT: s_and_b32 s2, s4, s6 -; GFX8-NEXT: s_and_b32 s3, s5, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_and_b32 s1, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_and_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ma: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b32 s2, s4, s6 -; GFX10-NEXT: s_and_b32 s3, s4, s6 -; GFX10-NEXT: s_and_b32 s2, s5, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_or_b32 s1, s6, s0 +; GFX10-NEXT: s_and_b32 s0, s6, s0 +; GFX10-NEXT: s_and_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0 @@ -515,8 +521,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-GISEL-LABEL: s_bfi_sha256_ma: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0 @@ -1402,8 +1408,8 @@ entry: define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_0: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1419,8 +1425,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1435,8 +1441,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1450,8 +1456,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1466,8 +1472,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -1490,8 +1496,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1507,8 +1513,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1523,8 +1529,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1538,8 +1544,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1554,8 +1560,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1578,8 +1584,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 
%a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-LABEL: s_bitselect_i64_pat_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,8 +1601,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-LABEL: s_bitselect_i64_pat_2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1611,8 +1617,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-LABEL: s_bitselect_i64_pat_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1626,8 +1632,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1642,8 
+1648,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] @@ -1666,8 +1672,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX7-LABEL: s_bfi_sha256_ma_i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1684,8 +1690,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-LABEL: s_bfi_sha256_ma_i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1701,8 +1707,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-LABEL: s_bfi_sha256_ma_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] @@ -1717,8 +1723,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] @@ -1734,8 +1740,8 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { ; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] ; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll index 4ad3667f689583..0f40576a7459cc 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll @@ -283,7 +283,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) { define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index 
2e64db12ef564c..f8bd44b7c98f59 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfm_b32 s2, s2, s3 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -36,11 +36,11 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 { ; SI-LABEL: s_bfm_pattern_simple: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s4, 0 +; SI-NEXT: s_bfm_b32 s4, s2, 0 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -48,10 +48,10 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) # ; ; VI-LABEL: s_bfm_pattern_simple: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s4, 0 +; VI-NEXT: s_bfm_b32 s2, s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git 
a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 6f52da2631b8a6..64555f14a55cc1 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -21,8 +21,8 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; FLAT-LABEL: s_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; ; GISEL-LABEL: s_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -62,10 +62,10 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-FLAT-LABEL: s_brev_i16: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 
0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] @@ -76,11 +76,11 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-LABEL: s_brev_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16 @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_mov_b32 s10, s6 @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i16: ; 
GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] @@ -187,8 +187,8 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +199,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; ; FLAT-LABEL: s_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dword s4, s[2:3], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -211,10 +211,10 @@ define amdgpu_kernel void @s_brev_i32(ptr 
addrspace(1) noalias %out, i32 %val) # ; ; GISEL-LABEL: s_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s4 +; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -224,11 +224,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-FLAT-LABEL: s_brev_i32: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s4 +; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 @@ -240,11 +240,11 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GFX11-GISEL-LABEL: s_brev_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s4 +; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # define amdgpu_kernel void 
@v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -278,7 +278,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -294,7 +294,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -311,9 +311,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] @@ -328,10 +326,8 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -351,7 +347,7 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -366,7 +362,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +377,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -394,7 +390,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -411,7 +407,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 @@ -430,7 +426,7 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -450,7 +446,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -467,7 +463,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -485,9 +481,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -503,11 +497,9 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -528,7 +520,7 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,7 +534,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -556,7 +548,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -568,7 +560,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 @@ -581,7 +573,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; 
GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] @@ -599,7 +591,7 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -619,7 +611,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -636,7 +628,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -654,9 +646,7 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -672,9 +662,7 @@ define amdgpu_kernel void 
@v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -697,8 +685,8 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 { ; SI-LABEL: s_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -713,8 +701,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; FLAT-LABEL: s_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) @@ -729,8 +717,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; ; GISEL-LABEL: s_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] ; GISEL-NEXT: s_brev_b64 s[2:3], s[6:7] @@ -746,8 +734,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; 
GFX11-FLAT-LABEL: s_brev_v2i64: ; GFX11-FLAT: ; %bb.0: ; GFX11-FLAT-NEXT: s_clause 0x1 -; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5] ; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] @@ -763,8 +751,8 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GFX11-GISEL-LABEL: s_brev_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[4:5] @@ -783,7 +771,7 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -805,7 +793,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 @@ -824,7 +812,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -844,9 +832,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLAT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLAT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -864,9 +850,7 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 857b13fab8a7ce..3dbbb877918ad2 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -6,18 +6,18 @@ define amdgpu_kernel void @br_cc_f16( ; SI-LABEL: br_cc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, 
off, s[8:11], 0 glc +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -29,28 +29,28 @@ define amdgpu_kernel void @br_cc_f16( ; SI-NEXT: .LBB0_2: ; %two ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: .LBB0_3: ; %one -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: br_cc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_cbranch_vccnz .LBB0_2 ; VI-NEXT: ; %bb.1: ; %one @@ -63,8 +63,8 @@ define amdgpu_kernel void @br_cc_f16( ; GFX11-LABEL: br_cc_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s2 @@ -111,7 +111,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_a( ; SI-LABEL: br_cc_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; VI-LABEL: br_cc_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -157,7 +157,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; ; GFX11-LABEL: br_cc_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ two: define amdgpu_kernel void @br_cc_f16_imm_b( ; SI-LABEL: br_cc_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -221,7 +221,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; VI-LABEL: br_cc_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; ; GFX11-LABEL: br_cc_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; 
GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index a77fc78e9aaf5e..0d2acd3973bbd0 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,10 +4,10 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s44, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s44, s[4:5], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] -; CHECK-NEXT: s_add_u32 s96, s96, s13 +; CHECK-NEXT: s_add_u32 s96, s96, s7 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s44, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 635f3e4886b875..2f637df4e93022 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -22,9 +22,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -34,10 +34,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; 
GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -63,9 +63,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -81,10 +81,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -110,9 +110,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -130,10 +130,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -158,7 +158,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -261,28 +261,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -300,17 +300,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_2-.Lpost_getpc6)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -375,7 +375,7 @@ loop: define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-LABEL: expand_requires_expand: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lt_i32 s0, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -453,8 +453,8 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB8_1: ; %if -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb 
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -572,10 +572,10 @@ ret: define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { ; GCN-LABEL: long_branch_hang: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s7, 6 @@ -607,25 +607,25 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GCN-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-NEXT: ; %bb.10: ; %Flow5 -; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_getpc_b64 s[2:3] ; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 -; GCN-NEXT: s_setpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s2, s2, (.LBB10_6-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s3, s3, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s5, 9 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s6, s7 ; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GCN-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GCN-NEXT: s_branch .LBB10_7 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: .LBB10_7: ; %bb19 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf -; 
GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xf +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 321a7ceb826f6e..e4c7df385d8619 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -19,7 +19,7 @@ declare i48 @llvm.bswap.i48(i48) #1 define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -34,7 +34,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -87,7 +87,7 @@ 
define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -166,7 +166,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr 
addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -226,7 +226,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -296,7 +296,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -312,7 +312,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_bswap_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -333,7 +333,7 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -357,7 +357,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -398,7 +398,7 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: test_bswap_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v4, 0x10203 ; VI-NEXT: s_mov_b32 s15, 0xf000 ; VI-NEXT: s_mov_b32 s14, -1 @@ -458,7 +458,7 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: test_bswap_v4i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 23e8f98a7861bc..8e47be7f2a0b1d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -21,7 +21,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -32,7 +32,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -42,7 +42,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 
s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -54,15 +54,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -71,27 +67,23 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -100,15 +92,11 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -116,30 +104,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -147,30 +131,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -178,30 +158,26 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; 
GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -210,15 +186,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB0_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -234,7 +210,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 @@ -245,7 +221,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -255,7 +231,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -266,15 +242,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -282,28 +254,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 
offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -311,12 +279,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -324,75 +288,63 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen 
offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v2, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB1_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) @@ -400,16 +352,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v2, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB1_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -846,8 +798,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -877,7 +829,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -888,8 +840,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr 
addrspace(7) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -920,16 +872,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -941,27 +889,23 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -969,17 +913,13 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -990,31 +930,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX908-NEXT: 
v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1025,31 +961,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s18, 0x800 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1060,31 +992,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB3_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 ; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1096,15 +1024,15 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) ; GFX6-NEXT: v_mov_b32_e32 v1, v8 ; GFX6-NEXT: v_mov_b32_e32 v2, v9 ; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB3_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr 
addrspace(7) %ptr, i32 256 @@ -1120,8 +1048,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 @@ -1150,7 +1078,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1160,8 +1088,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -1191,15 +1119,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x800 -; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; 
GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1209,29 +1133,25 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; GFX10-NEXT: v_mov_b32_e32 v7, v2 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5] ; GFX10-NEXT: v_mov_b32_e32 v4, v7 ; GFX10-NEXT: v_mov_b32_e32 v5, v8 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1239,15 +1159,11 @@ 
define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1256,31 +1172,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX908-NEXT: v_mov_b32_e32 v9, v4 ; GFX908-NEXT: v_mov_b32_e32 v8, v3 ; GFX908-NEXT: v_mov_b32_e32 v7, v2 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v5, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1289,31 +1201,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX8-NEXT: v_mov_b32_e32 v9, v4 ; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v5, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX7-NEXT: 
s_add_i32 s6, s18, 0x800 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1322,31 +1230,27 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX7-NEXT: v_mov_b32_e32 v9, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v5, v8 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB4_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[8:11], 0 offen offset:2048 -; GFX6-NEXT: s_add_i32 s6, s18, 0x800 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: s_mov_b64 s[8:9], 
0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 ; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1356,17 +1260,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) ; GFX6-NEXT: v_mov_b32_e32 v9, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v5, v8 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB4_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -1913,16 +1817,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v5, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; 
GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1954,11 +1858,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -1988,16 +1892,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2030,243 +1934,219 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v5, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 
0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s9 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 ; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v5, s4 -; GFX908-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: 
.LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v2, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: buffer_load_dword v2, v5, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: 
buffer_load_dword v2, v5, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_and_b32_e32 v3, s11, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v3 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, 
s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB6_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: 
s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB6_1 ; GFX6-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2283,16 +2163,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v3, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2323,11 +2203,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen -; 
GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -2356,16 +2236,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s4, 0x200 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2397,236 +2277,212 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX10-NEXT: 
s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_addk_i32 s8, 0x200 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 
s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: buffer_load_dword v3, v1, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s9 +; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s6, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3 ; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s6, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2 ; GFX908-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX908-NEXT: v_lshlrev_b32_e32 v1, s6, v1 -; GFX908-NEXT: v_and_or_b32 v1, v2, s7, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1 +; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v2, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, s6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2 ; GFX8-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], 
v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB7_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB7_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -3282,17 +3138,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Inner Loop Header: Depth=1 @@ -3332,11 +3187,11 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -3374,17 +3229,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start @@ -3426,24 +3280,20 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3451,133 +3301,121 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, 
s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: 
s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v3, v3, v5 @@ -3587,110 +3425,102 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; 
GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB9_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: 
s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB9_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3707,17 +3537,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3756,11 +3585,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 @@ -3797,17 +3626,16 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start @@ -3848,24 +3676,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: 
s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -3873,130 +3697,118 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: 
buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: 
v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 @@ -4006,106 +3818,98 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, 
exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 
0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -4818,7 +4622,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -4829,7 +4633,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -4839,10 +4643,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 
:: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start @@ -4870,15 +4675,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4887,27 +4688,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4916,15 +4713,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -4932,30 +4725,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -4965,37 +4754,33 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -5012,40 +4797,36 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -5063,18 +4844,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] 
; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -5090,7 +4871,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 @@ -5101,7 +4882,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -5111,8 +4892,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 @@ -5142,15 +4923,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, 
s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5158,28 +4935,24 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, 
v1, s[4:7], 0 offen offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5187,12 +4960,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], 0 offen offset:1024 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] @@ -5200,15 +4969,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v2, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5217,38 +4982,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: 
buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -5265,40 +5026,36 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: 
buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -5316,18 +5073,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 @@ -5844,7 +5601,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -5856,9 +5613,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -5900,8 +5657,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5954,17 +5711,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5981,41 +5734,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, 
v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6031,39 +5780,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; 
GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) @@ -6079,38 +5824,34 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6128,43 +5869,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; 
GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -6179,38 +5916,34 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; 
GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -6226,17 +5959,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -6252,7 +5985,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: 
buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 @@ -6263,9 +5996,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff @@ -6307,8 +6040,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 @@ -6356,17 +6089,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: 
v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6382,41 +6111,37 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: 
s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6431,39 +6156,35 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6478,38 +6199,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] ; 
GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6526,44 +6243,40 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6578,38 +6291,34 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6625,17 +6334,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; 
GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 @@ -7313,10 +7022,11 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start @@ -7343,7 +7053,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -7353,10 +7063,11 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; 
GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start @@ -7384,15 +7095,11 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -7401,31 +7108,27 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: 
s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7433,31 +7136,27 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7465,30 +7164,26 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: 
s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7496,30 +7191,26 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword 
v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7527,30 +7218,26 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7559,15 +7246,15 @@ define float 
@buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index ec0408236975d1..df19d135980439 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -13,15 +13,15 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -29,13 +29,13 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -58,10 +58,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -69,35 +69,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 
v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -105,31 +97,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -138,31 +126,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -171,57 +155,49 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 @@ -229,12 +205,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -257,10 +233,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen 
offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -268,65 +244,53 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -334,31 +298,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: 
v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -366,51 +326,43 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val 
syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -444,7 +396,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -505,7 +457,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -535,7 +487,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 
; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -564,7 +516,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -623,7 +575,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -683,7 +635,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -743,7 +695,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ 
-769,7 +721,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -796,863 +748,1553 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: 
buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: 
v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 
offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; 
GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], 
v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 
0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: 
v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: 
buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: 
s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: 
s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: 
s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; 
GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: 
v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: 
s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 
0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; 
GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; 
GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; 
GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: 
v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 
s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double 
@buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; 
GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, 
vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], 
v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: 
s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; 
implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; 
GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 +; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 
v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 
+; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) 
%ptr, i32 256 + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: 
buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: 
s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 
s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: 
v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: 
v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; 
GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1660,27 +2302,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 
s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1703,26 +2344,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1739,29 +2380,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1784,289 +2424,264 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], 
v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 
0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: 
buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr 
addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2089,25 +2704,25 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -2124,28 +2739,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2168,261 +2782,237 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; 
GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; 
GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, 
v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s11, 
v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 
v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2438,7 +3028,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2451,14 +3041,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2474,7 +3064,7 @@ define half 
@buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2489,8 +3079,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2499,13 +3089,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2516,7 +3106,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: 
s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2528,14 +3118,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2545,7 +3135,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2559,8 +3149,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; 
GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2568,13 +3158,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2587,7 +3177,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2600,14 +3190,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child 
Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2622,7 +3212,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2637,8 +3227,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2648,13 +3238,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2665,7 +3255,7 @@ 
define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2677,13 +3267,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2694,7 +3284,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2708,8 +3298,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 
Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2719,13 +3309,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2736,7 +3326,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2748,14 +3338,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2764,7 +3354,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2777,8 +3367,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2786,13 +3376,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2803,7 +3393,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2815,14 +3405,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2832,7 +3422,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2845,8 +3435,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap 
v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2854,13 +3444,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2871,7 +3461,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2883,14 +3473,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 
s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2901,7 +3491,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -2914,8 +3504,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2923,13 +3513,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2939,7 +3529,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2950,15 +3540,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2970,7 +3560,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2983,8 +3573,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2992,14 +3582,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3009,7 +3599,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3020,15 +3610,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; 
GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3040,7 +3630,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3053,8 +3643,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3062,7 +3652,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3070,7 +3660,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3078,27 +3668,26 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; 
%atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3127,27 +3716,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3169,29 +3758,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3221,33 +3809,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3255,133 +3839,121 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; 
GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, 
s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v3, v3, v5 @@ -3391,141 +3963,132 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; 
GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3554,26 +4117,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 
+; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3595,28 +4158,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3646,32 +4208,28 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; 
GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -3679,130 +4237,118 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; 
GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: 
v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v5, v5, v3 @@ -3812,117 +4358,109 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; 
GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; 
GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3938,7 +4476,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3951,14 +4489,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3980,7 +4518,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: 
v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3995,8 +4533,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4005,13 +4543,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4022,7 +4560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop 
Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4034,15 +4572,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4057,7 +4595,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4071,8 +4609,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz 
.LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4080,13 +4618,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4099,7 +4637,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4112,15 +4650,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; 
%atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4142,7 +4680,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4157,8 +4695,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4168,14 +4706,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4186,7 +4724,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4198,13 +4736,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4219,7 +4757,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4233,8 +4771,8 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4244,13 +4782,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4261,7 +4799,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4273,15 +4811,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11 @@ -4294,7 +4832,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4307,8 +4845,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4316,13 +4854,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4333,7 +4871,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4345,15 +4883,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4367,7 +4905,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr 
; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4380,8 +4918,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4389,13 +4927,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4406,7 +4944,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This 
Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4418,14 +4956,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10 @@ -4441,7 +4979,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4454,8 +4992,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4463,13 +5001,13 @@ 
define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4479,7 +5017,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4490,15 +5028,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 @@ -4511,7 +5049,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4524,8 +5062,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4533,14 +5071,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4550,7 +5088,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; 
GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4561,15 +5099,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4582,7 +5120,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4595,8 +5133,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: 
Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4604,7 +5142,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4612,7 +5150,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4620,22 +5158,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; 
GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4652,22 +5190,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4682,22 +5220,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4715,26 +5253,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 
s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4743,65 +5277,57 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, 
s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen 
offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4809,33 +5335,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 
-; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4846,38 +5368,34 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 
+; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4893,41 +5411,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4944,40 +5458,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4994,21 +5508,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5023,21 +5537,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5054,25 +5568,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5080,97 +5590,85 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5180,39 +5678,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5228,41 +5722,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap 
v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, 
v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5279,27 +5769,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5308,7 +5798,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5322,14 +5812,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5339,7 +5829,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5354,8 +5844,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; 
GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5364,18 +5854,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5388,21 +5878,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; 
GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5416,8 +5906,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5425,19 +5915,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5451,14 +5941,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5467,7 +5957,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5482,8 +5972,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5493,19 +5983,19 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5518,13 +6008,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5532,7 +6022,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; 
GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5546,8 +6036,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5557,18 +6047,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5581,20 +6071,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: 
buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5607,8 +6097,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5616,18 +6106,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5640,21 +6130,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_max_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5667,8 +6157,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5676,18 +6166,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5700,15 +6190,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: 
v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5718,7 +6208,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5731,8 +6221,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5740,18 +6230,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5763,7 +6253,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5775,9 +6265,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5793,7 +6283,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5806,8 +6296,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall 
; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5817,19 +6307,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5841,7 +6331,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5853,9 +6343,9 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5872,7 +6362,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5885,8 +6375,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5896,7 +6386,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5904,7 +6394,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5912,23 +6402,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5961,25 +6451,25 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6006,16 +6496,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6024,7 +6514,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6058,28 +6548,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 
16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6095,42 +6581,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; 
GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6145,40 +6627,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6193,39 +6671,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 
v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6242,44 +6716,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; 
GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6293,39 +6763,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 
offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6340,39 +6806,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x 
bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6403,24 +6869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6447,23 +6913,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6495,27 +6961,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6530,42 +6992,38 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: 
v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; 
GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6579,40 +7037,36 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; 
GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6626,39 +7080,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6674,45 +7124,41 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6726,39 +7172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: 
v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6773,26 +7215,26 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6801,7 +7243,7 @@ 
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6815,15 +7257,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6847,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6862,8 +7304,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: 
s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6872,18 +7314,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -6896,7 +7338,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6904,9 +7346,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; 
GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v9 @@ -6927,7 +7369,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6941,8 +7383,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6950,19 +7392,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6976,16 +7418,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7009,7 +7451,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7024,8 +7466,8 @@ define <2 x 
bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7035,20 +7477,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7061,14 +7503,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; 
%bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7089,7 +7531,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7103,8 +7545,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7114,18 +7556,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7138,7 +7580,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7146,9 +7588,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 @@ -7167,7 +7609,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7180,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7189,18 +7631,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7213,7 +7655,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz 
.LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7221,9 +7663,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7243,7 +7685,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7256,8 +7698,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7265,18 +7707,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz 
.LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7289,15 +7731,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 @@ -7320,7 +7762,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7333,8 +7775,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7342,18 +7784,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7365,7 +7807,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; 
%bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7376,9 +7818,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7392,7 +7834,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7405,8 +7847,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7415,19 +7857,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: 
s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7439,7 +7881,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7450,9 +7892,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7466,7 +7908,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; 
Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7479,8 +7921,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7490,14 +7932,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7505,21 +7947,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg 
%ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7536,22 +7978,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: 
v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7565,21 +8007,21 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -7597,26 +8039,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -7625,33 +8063,29 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7659,33 +8093,29 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7693,32 +8123,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 
exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7726,32 +8152,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -7759,32 +8181,28 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset: +; GFX6-LABEL: 
buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -7793,22 +8211,22 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmax ptr addrspace(7) %gep, float %val 
seq_cst + %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index cd01cc7309fcd2..b18826e5b46a00 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -13,15 +13,15 @@ ; float ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN @@ -29,13 +29,13 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -58,10 +58,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -69,35 +69,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: 
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -105,31 +97,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 
-; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -138,31 +126,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; 
GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -171,57 +155,49 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 @@ -229,12 +205,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr 
addrspace(7) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 @@ -257,10 +233,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -268,65 +244,53 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin 
v0, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 
exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -334,31 +298,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -366,51 +326,43 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB1_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -444,7 +396,7 @@ 
define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -505,7 +457,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -535,7 +487,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, exec_lo @@ -564,7 +516,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 @@ -623,7 +575,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 
; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 @@ -683,7 +635,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 @@ -743,7 +695,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[6:7], exec @@ -769,7 +721,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[6:7], exec @@ -796,863 +748,1553 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr 
addrspace(7) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB3_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap 
v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; 
GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc +; 
GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 +; GFX940-NEXT: s_mov_b64 s[4:5], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, s6 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX940-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s6, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v1, s4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 -; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s18 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap 
v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s18 -; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s18, 0x800 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, s10 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v10, v3 -; GFX908-NEXT: v_mov_b32_e32 v9, v2 -; GFX908-NEXT: v_mov_b32_e32 v8, v1 -; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX908-NEXT: v_mov_b32_e32 v5, v0 +; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX908-NEXT: v_mov_b32_e32 v2, v7 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v8 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s18, 0x800 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-NEXT: 
v_mov_b32_e32 v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc +; GFX8-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v7 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v8 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_cbranch_execnz .LBB4_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, 
s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen 
offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] -; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop 
Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v9 -; GFX12-NEXT: v_readfirstlane_b32 s5, v10 -; GFX12-NEXT: v_readfirstlane_b32 s6, v7 -; GFX12-NEXT: v_readfirstlane_b32 s7, v8 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB5_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v6 -; GFX940-NEXT: v_mov_b32_e32 v6, v5 -; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: v_readfirstlane_b32 s4, v0 -; GFX940-NEXT: v_readfirstlane_b32 s5, v1 -; GFX940-NEXT: v_readfirstlane_b32 s6, v2 -; GFX940-NEXT: v_readfirstlane_b32 s7, v3 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] -; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 -; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX940-NEXT: ; implicit-def: $vgpr4 -; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: -; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 -; GFX11-NEXT: s_mov_b32 
s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB5_4 Depth 2 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], 
v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 -; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v10 -; GFX11-NEXT: v_readfirstlane_b32 s6, v7 -; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB5_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB5_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_readfirstlane_b32 s8, v0 -; GFX10-NEXT: v_readfirstlane_b32 s9, v1 -; GFX10-NEXT: v_readfirstlane_b32 s10, v2 -; GFX10-NEXT: v_readfirstlane_b32 s11, v3 -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB5_1 -; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v6, v5 -; GFX90A-NEXT: 
s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 -; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX90A-NEXT: ; implicit-def: $vgpr4 -; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 -; GFX90A-NEXT: ; %bb.2: -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 +; 
GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB6_1: ; 
%atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v10, v3 +; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_mov_b32_e32 v8, v1 +; GFX908-NEXT: v_mov_b32_e32 v7, v0 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, v7 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: v_mov_b32_e32 v3, v8 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v3, v8 +; GFX8-NEXT: s_andn2_b64 
exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: 
v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 +; GFX12-NEXT: ; implicit-def: $vgpr4 +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-NEXT: v_readfirstlane_b32 s4, v9 +; GFX12-NEXT: v_readfirstlane_b32 s5, v10 +; GFX12-NEXT: v_readfirstlane_b32 s6, v7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX12-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_3 +; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_readfirstlane_b32 s4, v0 +; GFX940-NEXT: v_readfirstlane_b32 s5, v1 +; GFX940-NEXT: v_readfirstlane_b32 s6, v2 +; GFX940-NEXT: v_readfirstlane_b32 s7, v3 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] +; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX940-NEXT: ; implicit-def: $vgpr4 +; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; 
%bb.2: +; GFX940-NEXT: s_mov_b64 exec, s[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 +; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_readfirstlane_b32 s8, v0 +; GFX10-NEXT: 
v_readfirstlane_b32 s9, v1 +; GFX10-NEXT: v_readfirstlane_b32 s10, v2 +; GFX10-NEXT: v_readfirstlane_b32 s11, v3 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 +; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: s_mov_b64 exec, 
s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v8, v3 +; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; 
GFX908-NEXT: v_readfirstlane_b32 s8, v9 +; GFX908-NEXT: v_readfirstlane_b32 s9, v10 +; GFX908-NEXT: v_readfirstlane_b32 s10, v7 +; GFX908-NEXT: v_readfirstlane_b32 s11, v8 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX908-NEXT: v_mov_b32_e32 v14, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v13, v0 +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_3 +; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: 
buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 +; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: +; GFX8-NEXT: s_mov_b64 exec, s[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Loop Header: Depth=1 +; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 +; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX8-NEXT: v_readfirstlane_b32 s8, v9 +; GFX8-NEXT: v_readfirstlane_b32 s9, v10 +; GFX8-NEXT: v_readfirstlane_b32 s10, v7 +; GFX8-NEXT: v_readfirstlane_b32 s11, v8 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] +; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 +; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] +; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v13, v0 +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_3 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[6:7], exec +; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_readfirstlane_b32 s8, v0 +; GFX7-NEXT: v_readfirstlane_b32 s9, v1 +; GFX7-NEXT: v_readfirstlane_b32 s10, v2 +; GFX7-NEXT: v_readfirstlane_b32 s11, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s9, v1 +; GFX6-NEXT: v_readfirstlane_b32 s10, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] +; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc +; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX6-NEXT: ; implicit-def: $vgpr4 
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; 
GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: 
v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v10, v1 +; GFX908-NEXT: v_mov_b32_e32 v9, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result +} + +define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 
v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x800 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v8, v3 -; GFX908-NEXT: v_mov_b32_e32 v7, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_add_i32 s10, s8, 0x800 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 -; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen 
offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_1 -; GFX908-NEXT: ; %bb.2: -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 -; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_readfirstlane_b32 s8, v9 -; GFX908-NEXT: v_readfirstlane_b32 s9, v10 -; GFX908-NEXT: v_readfirstlane_b32 s10, v7 -; GFX908-NEXT: v_readfirstlane_b32 s11, v8 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB5_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX908-NEXT: s_mov_b64 exec, s[12:13] +; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v7 +; GFX908-NEXT: v_mov_b32_e32 v1, v8 +; GFX908-NEXT: v_mov_b32_e32 v2, v9 +; GFX908-NEXT: v_mov_b32_e32 v3, v10 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX908-NEXT: v_mov_b32_e32 v14, v1 -; GFX908-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v13, v0 ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB5_3 -; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_add_i32 s10, s8, 0x800 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s10 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: 
buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_1 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] -; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start -; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB5_4 Depth 2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 -; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1 -; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_readfirstlane_b32 s8, v9 -; GFX8-NEXT: v_readfirstlane_b32 s9, v10 -; GFX8-NEXT: v_readfirstlane_b32 s10, v7 -; GFX8-NEXT: v_readfirstlane_b32 s11, v8 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] -; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] -; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc -; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB5_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v7 +; GFX8-NEXT: v_mov_b32_e32 v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v3, v10 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] -; GFX8-NEXT: v_mov_b32_e32 v14, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; 
GFX8-NEXT: v_mov_b32_e32 v13, v0 ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB5_3 -; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_readfirstlane_b32 s8, v0 -; GFX7-NEXT: v_readfirstlane_b32 s9, v1 -; GFX7-NEXT: v_readfirstlane_b32 s10, v2 -; GFX7-NEXT: v_readfirstlane_b32 s11, v3 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX7-NEXT: ; implicit-def: $vgpr4 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB5_1 -; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall: +; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_readfirstlane_b32 s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s9, v1 -; GFX6-NEXT: v_readfirstlane_b32 s10, v2 -; GFX6-NEXT: v_readfirstlane_b32 s11, v3 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc -; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GFX6-NEXT: ; implicit-def: $vgpr4 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB5_1 -; GFX6-NEXT: ; %bb.2: -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result } @@ -1660,27 +2302,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a ; half ; -------------------------------------------------------------------- -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1703,26 +2344,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; 
GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 @@ -1739,29 +2380,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: 
s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -1784,289 +2424,264 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; 
GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: 
v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, 
s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 
s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: 
s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2089,25 +2704,25 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
v_lshrrev_b32_e32 v0, s6, v1 @@ -2124,28 +2739,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -2168,261 +2782,237 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; 
GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB11_1: ; 
%atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 
s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, s10, v0 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; 
GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 
exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: 
v_and_b32_e32 v4, s11, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2438,7 +3028,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -2451,14 +3041,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -2474,7 +3064,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -2489,8 +3079,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz 
.LBB12_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2499,13 +3089,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB8_3 +; GFX12-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2516,7 +3106,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -2528,14 +3118,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB12_3: ; 
%atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2545,7 +3135,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -2559,8 +3149,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB8_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB12_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2568,13 +3158,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB8_3 +; GFX940-NEXT: s_cbranch_execnz .LBB12_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2587,7 +3177,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -2600,14 +3190,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -2622,7 +3212,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -2637,8 +3227,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, 
s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2648,13 +3238,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB8_3 +; GFX11-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -2665,7 +3255,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -2677,13 +3267,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: 
s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -2694,7 +3284,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -2708,8 +3298,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB8_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB12_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -2719,13 +3309,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB8_3 +; GFX10-NEXT: s_cbranch_execnz .LBB12_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX90A-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2736,7 +3326,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -2748,14 +3338,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2764,7 +3354,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -2777,8 +3367,8 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -2786,13 +3376,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB8_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -2803,7 +3393,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -2815,14 +3405,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, 
s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2832,7 +3422,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -2845,8 +3435,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB8_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB12_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2854,13 +3444,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB8_3 +; GFX908-NEXT: s_cbranch_execnz .LBB12_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -2871,7 +3461,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -2883,14 +3473,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 @@ -2901,7 +3491,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: 
v_readfirstlane_b32 s9, v1 @@ -2914,8 +3504,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB8_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB12_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2923,13 +3513,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB8_3 +; GFX8-NEXT: s_cbranch_execnz .LBB12_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -2939,7 +3529,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -2950,15 +3540,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: 
s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -2970,7 +3560,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -2983,8 +3573,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB8_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB12_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -2992,14 +3582,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB8_3 +; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -3009,7 +3599,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -3020,15 +3610,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 -; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB8_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -3040,7 +3630,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1 +; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner 
Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -3053,8 +3643,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB8_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB12_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -3062,7 +3652,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB8_3 +; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -3070,7 +3660,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -3078,27 +3668,26 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add ; bfloat ; -------------------------------------------------------------------- -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr 
addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3127,27 +3716,27 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v4, s5 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3169,29 +3758,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: 
v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v4, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3221,33 +3809,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, 
s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 @@ -3255,133 +3839,121 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v2 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 ; 
GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; 
GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v4, s9 +; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v2 -; GFX908-NEXT: s_andn2_b64 
exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 @@ -3391,141 +3963,132 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v2, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v4, s9 +; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX7-NEXT: 
s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: 
v_mov_b32_e32 v1, v2 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s6, 0x200 +; GFX12-NEXT: s_addk_co_i32 s4, 0x200 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_and_b32 s4, s6, -4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s6, 3 +; GFX12-NEXT: s_and_b32 s5, s4, -4 +; GFX12-NEXT: s_and_b32 s4, s4, 3 +; GFX12-NEXT: v_mov_b32_e32 v2, s5 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_not_b32 s6, s5 +; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3554,26 +4117,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_addk_i32 s6, 0x200 -; GFX940-NEXT: s_and_b32 s4, s6, -4 -; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: s_addk_i32 s4, 0x200 +; GFX940-NEXT: s_and_b32 s5, s4, -4 +; GFX940-NEXT: v_mov_b32_e32 v2, s5 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen -; GFX940-NEXT: s_and_b32 s4, s6, 3 +; GFX940-NEXT: s_and_b32 s4, s4, 3 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX940-NEXT: s_not_b32 s7, s4 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD @@ -3595,28 +4158,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s6, 0x200 +; GFX11-NEXT: s_addk_i32 s4, 0x200 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s6, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s6, 3 +; GFX11-NEXT: s_and_b32 s5, s4, -4 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_not_b32 s6, s5 +; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 @@ -3646,32 +4208,28 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_addk_i32 s18, 0x200 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_and_b32 s4, s18, -4 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_and_b32 s4, s18, 3 +; GFX10-NEXT: s_addk_i32 s8, 0x200 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_not_b32 s6, s5 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: s_and_b32 s9, s8, -4 +; GFX10-NEXT: s_and_b32 s8, s8, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_lshl_b32 s8, s8, 3 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8 +; GFX10-NEXT: s_not_b32 s10, s9 +; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1 @@ -3679,130 +4237,118 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_addk_i32 s18, 0x200 -; GFX90A-NEXT: s_and_b32 s4, s18, -4 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX90A-NEXT: s_and_b32 s4, s18, 3 -; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 -; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX90A-NEXT: s_not_b32 s7, s4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_addk_i32 s8, 0x200 +; GFX90A-NEXT: s_and_b32 s9, s8, -4 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, s9 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX90A-NEXT: s_and_b32 s8, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s10, s8, 3 +; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX90A-NEXT: s_not_b32 s11, s8 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff -; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_addk_i32 s18, 0x200 -; GFX908-NEXT: s_and_b32 s4, s18, -4 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v2, s4 -; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX908-NEXT: s_and_b32 s4, s18, 3 -; GFX908-NEXT: s_lshl_b32 s6, s4, 3 -; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX908-NEXT: s_not_b32 s7, s4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_addk_i32 s8, 0x200 +; GFX908-NEXT: s_and_b32 s9, s8, -4 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX908-NEXT: s_and_b32 s8, s8, 3 +; GFX908-NEXT: s_lshl_b32 s10, s8, 3 +; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX908-NEXT: s_not_b32 s11, s8 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 +; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_addk_i32 s18, 0x200 -; GFX8-NEXT: s_and_b32 s4, s18, -4 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX8-NEXT: s_and_b32 s4, s18, 3 -; GFX8-NEXT: s_lshl_b32 s6, s4, 3 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX8-NEXT: s_not_b32 s7, s4 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_addk_i32 s8, 0x200 +; GFX8-NEXT: s_and_b32 s9, s8, -4 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX8-NEXT: s_and_b32 s8, s8, 3 +; GFX8-NEXT: s_lshl_b32 s10, s8, 3 +; 
GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10 +; GFX8-NEXT: s_not_b32 s11, s8 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 @@ -3812,117 +4358,109 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX8-NEXT: v_and_b32_e32 v4, s11, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: s_addk_i32 s18, 0x200 -; GFX7-NEXT: s_and_b32 s4, s18, -4 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX7-NEXT: s_and_b32 s4, s18, 3 -; GFX7-NEXT: s_lshl_b32 s6, s4, 3 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX7-NEXT: s_addk_i32 s8, 0x200 +; GFX7-NEXT: s_and_b32 s9, s8, -4 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX7-NEXT: s_and_b32 s8, s8, 3 +; GFX7-NEXT: s_lshl_b32 s10, s8, 3 +; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_not_b32 s7, s4 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_not_b32 s11, s8 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v1, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_addk_i32 s18, 0x200 -; GFX6-NEXT: s_and_b32 s4, s18, -4 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen -; GFX6-NEXT: s_and_b32 s4, s18, 3 -; GFX6-NEXT: s_lshl_b32 s6, s4, 3 -; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 +; GFX6-NEXT: s_addk_i32 s8, 0x200 +; GFX6-NEXT: s_and_b32 s9, s8, -4 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen +; GFX6-NEXT: s_and_b32 s8, s8, 3 +; GFX6-NEXT: s_lshl_b32 s10, s8, 3 +; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_not_b32 s11, s8 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 
v0, s10, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3938,7 +4476,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -3951,14 +4489,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -3980,7 +4518,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -3995,8 +4533,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v4, v6 @@ -4005,13 +4543,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB11_3 +; GFX12-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4022,7 +4560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 ; GFX940-NEXT: v_not_b32_e32 v10, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -4034,15 +4572,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff -; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX940-NEXT: ; Child 
Loop BB15_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX940-NEXT: s_mov_b64 s[8:9], exec @@ -4057,7 +4595,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -4071,8 +4609,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB15_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4080,13 +4618,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB11_3 +; GFX940-NEXT: s_cbranch_execnz .LBB15_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4099,7 +4637,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -4112,15 +4650,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -4142,7 +4680,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -4157,8 +4695,8 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4168,14 +4706,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 @@ -4186,7 +4724,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff ; GFX10-NEXT: v_not_b32_e32 v9, v6 -; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -4198,13 +4736,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -4219,7 +4757,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -4233,8 +4771,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB11_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB15_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -4244,13 +4782,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB11_3 +; GFX10-NEXT: s_cbranch_execnz .LBB15_3 ; 
GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4261,7 +4799,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 ; GFX90A-NEXT: v_not_b32_e32 v10, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -4273,15 +4811,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 @@ -4294,7 +4832,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 ; 
GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -4307,8 +4845,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -4316,13 +4854,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB11_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 @@ -4333,7 +4871,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX908-NEXT: v_not_b32_e32 v9, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB15_1: ; 
=>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -4345,15 +4883,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff -; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4367,7 +4905,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -4380,8 +4918,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB11_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB15_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX908-NEXT: s_mov_b64 
exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4389,13 +4927,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB11_3 +; GFX908-NEXT: s_cbranch_execnz .LBB15_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 @@ -4406,7 +4944,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 ; GFX8-NEXT: v_not_b32_e32 v9, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -4418,14 +4956,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 @@ -4441,7 +4979,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -4454,8 +4992,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB11_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB15_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4463,13 +5001,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB11_3 +; GFX8-NEXT: s_cbranch_execnz .LBB15_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4479,7 +5017,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; 
GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX7-NEXT: v_not_b32_e32 v9, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -4490,15 +5028,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4511,7 +5049,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v5, v6 -; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -4524,8 +5062,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB11_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX7-NEXT: 
s_cbranch_execnz .LBB15_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4533,14 +5071,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB11_3 +; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 @@ -4550,7 +5088,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_not_b32_e32 v9, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -4561,15 +5099,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start +; 
GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB11_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4582,7 +5120,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v5, v6 -; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1 +; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -4595,8 +5133,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB11_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB15_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -4604,7 +5142,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB11_3 +; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 @@ -4612,7 +5150,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val 
syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -4620,22 +5158,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -4652,22 +5190,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-NEXT: 
; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -4682,22 +5220,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 
v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -4715,26 +5253,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -4743,65 +5277,57 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, 
v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], 
v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -4809,33 +5335,29 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: 
buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -4846,38 +5368,34 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: 
v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v0, v0 @@ -4893,41 +5411,37 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -4944,40 +5458,40 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 ; GFX12-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 @@ -4994,21 +5508,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5023,21 +5537,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: v_mov_b32_e32 v1, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 ; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5054,25 +5568,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; 
GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 @@ -5080,97 +5590,85 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; 
GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, 
s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX908-NEXT: v_mov_b32_e32 v1, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen 
offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -5180,39 +5678,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: 
s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5228,41 +5722,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -5279,27 +5769,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace( ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5308,7 +5798,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -5322,14 +5812,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: 
s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 ; GFX12-NEXT: s_mov_b32 s2, exec_lo @@ -5339,7 +5829,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -5354,8 +5844,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5364,18 +5854,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_3 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -5388,21 +5878,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX940-NEXT: s_mov_b64 s[8:9], exec ; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -5416,8 +5906,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 
-; GFX940-NEXT: s_cbranch_execnz .LBB14_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5425,19 +5915,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_3 +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -5451,14 +5941,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; 
Child Loop BB14_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX11-NEXT: s_mov_b32 s2, exec_lo @@ -5467,7 +5957,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -5482,8 +5972,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5493,19 +5983,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: 
s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -5518,13 +6008,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX10-NEXT: s_mov_b32 s6, exec_lo @@ -5532,7 +6022,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -5546,8 +6036,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -5557,18 +6047,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB14_3 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -5581,20 +6071,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 -; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 ; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 
v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -5607,8 +6097,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -5616,18 +6106,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -5640,21 +6130,21 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 -; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 ; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -5667,8 +6157,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5676,18 +6166,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB14_3 +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; 
GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -5700,15 +6190,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 -; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 @@ -5718,7 +6208,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 
Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -5731,8 +6221,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB14_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -5740,18 +6230,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB14_3 +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -5763,7 +6253,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; 
GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5775,9 +6265,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: s_mov_b64 s[12:13], exec @@ -5793,7 +6283,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -5806,8 +6296,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB14_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5817,19 +6307,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB14_3 +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; 
%atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -5841,7 +6331,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -5853,9 +6343,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB14_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: s_mov_b64 s[12:13], exec @@ -5872,7 +6362,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1 +; GFX6-NEXT: .LBB18_4: ; Parent 
Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -5885,8 +6375,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB18_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 @@ -5896,7 +6386,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB14_3 +; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v4 @@ -5904,7 +6394,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -5912,23 +6402,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +define <2 x 
bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: v_mov_b32_e32 v4, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 @@ -5961,25 +6451,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: 
s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v0 @@ -6006,16 +6496,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6024,7 +6514,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v0 @@ -6058,28 +6548,24 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v0 @@ -6095,42 +6581,38 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; 
GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v0 @@ -6145,40 +6627,36 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; 
GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v0 @@ -6193,39 +6671,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; 
GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6242,44 +6716,40 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; 
GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6293,39 +6763,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v4, s8 +; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6340,39 +6806,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add ; GFX6-NEXT: 
v_alignbit_b32 v0, v0, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: 
v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6403,24 +6869,24 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_add_i32 s4, s6, 0x400 +; GFX940-NEXT: s_addk_i32 s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[6:7], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302 ; GFX940-NEXT: v_mov_b32_e32 v4, s4 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6447,23 +6913,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX940-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] ; GFX940-NEXT: v_mov_b32_e32 v1, v6 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 @@ -6495,27 +6961,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v4, s8 +; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6530,42 +6992,38 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; 
GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s18 -; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s8 +; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_addk_i32 s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[10:11], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6579,40 +7037,36 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] 
op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v1, s18 -; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_addk_i32 s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[10:11], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6626,39 +7080,35 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_addk_i32 s8, 0x400 +; GFX8-NEXT: s_mov_b64 
s[10:11], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 @@ -6674,45 +7124,41 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 
s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_addk_i32 s8, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b64 s[10:11], 0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6726,39 +7172,35 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_addk_i32 s8, 0x400 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: s_mov_b64 s[10:11], 0 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -6773,26 +7215,26 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6801,7 +7243,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 @@ -6815,15 +7257,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 ; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 
exec_lo, s1 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -6847,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX12-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -6862,8 +7304,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -6872,18 +7314,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX940-NEXT: s_mov_b64 s[2:3], exec -; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2 @@ -6896,7 +7338,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 ; GFX940-NEXT: ; implicit-def: $vgpr4 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_mov_b64 s[2:3], 0 @@ -6904,9 +7346,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_movk_i32 s10, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302 -; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Loop Header: Depth=1 -; GFX940-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 @@ -6927,7 +7369,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] -; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX940-NEXT: ; => This Inner Loop Header: 
Depth=2 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1 @@ -6941,8 +7383,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_4 -; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX940-NEXT: s_mov_b64 exec, s[8:9] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -6950,19 +7392,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX940-NEXT: v_mov_b32_e32 v7, v4 ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2 @@ -6976,16 +7418,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 ; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 @@ -7009,7 +7451,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX11-NEXT: v_mov_b32_e32 v4, v5 ; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7024,8 +7466,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7035,20 +7477,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 ; 
GFX11-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo -; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2 @@ -7061,14 +7503,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 ; GFX10-NEXT: ; %bb.2: ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Loop Header: Depth=1 -; GFX10-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 @@ -7089,7 +7531,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 -; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX10-NEXT: v_readfirstlane_b32 s8, v0 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1 @@ -7103,8 +7545,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_4 -; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 @@ -7114,18 +7556,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec -; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 @@ -7138,7 +7580,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: ; implicit-def: $vgpr4 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz 
.LBB17_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 @@ -7146,9 +7588,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 -; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 @@ -7167,7 +7609,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 @@ -7180,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 -; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 @@ -7189,18 +7631,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: v_mov_b32_e32 v7, v4 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec -; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2 @@ -7213,7 +7655,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 @@ -7221,9 +7663,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_movk_i32 s14, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302 -; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7243,7 +7685,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; 
GFX908-NEXT: s_mov_b64 s[12:13], exec ; GFX908-NEXT: v_mov_b32_e32 v5, v6 -; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1 @@ -7256,8 +7698,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_4 -; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7265,18 +7707,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2 @@ -7289,15 +7731,15 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 -; GFX8-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 @@ -7320,7 +7762,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 -; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1 @@ -7333,8 +7775,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_4 -; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 @@ -7342,18 +7784,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; 
GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec -; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2 @@ -7365,7 +7807,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: ; implicit-def: $vgpr4 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7376,9 +7818,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 -; GFX7-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7392,7 +7834,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1 @@ -7405,8 +7847,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_4 -; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7415,19 +7857,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall: +; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2 @@ -7439,7 +7881,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: 
buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX6-NEXT: ; implicit-def: $vgpr4 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 @@ -7450,9 +7892,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start +; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 -; GFX6-NEXT: ; Child Loop BB17_4 Depth 2 +; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 @@ -7466,7 +7908,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1 +; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1 @@ -7479,8 +7921,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_4 -; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1 +; GFX6-NEXT: s_cbranch_execnz .LBB21_4 +; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -7490,14 +7932,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: 
v_lshlrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_cbranch_execnz .LBB17_3 +; GFX6-NEXT: s_cbranch_execnz .LBB21_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, v7 ; GFX6-NEXT: v_mov_b32_e32 v1, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -7505,21 +7947,21 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; misc ; -------------------------------------------------------------------- -define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { +; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400 +; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_addk_co_i32 s4, 0x400 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start 
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v5, v0 @@ -7536,22 +7978,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX940-NEXT: s_addk_i32 s6, 0x400 +; GFX940-NEXT: s_add_i32 s6, s4, 0x400 ; GFX940-NEXT: s_mov_b64 s[4:5], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 ; GFX940-NEXT: v_mov_b32_e32 v3, s6 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v0 @@ -7565,21 +8007,21 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: 
; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: s_add_i32 s4, s6, 0x400 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 @@ -7597,26 +8039,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: s_add_i32 s4, s18, 0x400 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: buffer_load_dword v0, v0, 
s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -7625,33 +8063,29 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: v_mov_b32_e32 v0, s18 -; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s18, 0x400 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x400 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 -; 
GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 @@ -7659,33 +8093,29 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: v_mov_b32_e32 v0, s18 -; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s18, 0x400 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_mov_b32_e32 v0, s8 +; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX908-NEXT: s_add_i32 s10, s8, 0x400 +; GFX908-NEXT: s_mov_b64 s[8:9], 0 ; GFX908-NEXT: 
v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_mov_b32_e32 v3, s10 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, v0 @@ -7693,32 +8123,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s18, 0x400 -; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX8-NEXT: s_add_i32 s10, s8, 0x400 +; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 
-; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 @@ -7726,32 +8152,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, v0 -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s18, 0x400 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 -; GFX7-NEXT: .LBB18_1: ; 
%atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, v0 @@ -7759,32 +8181,28 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset: +; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, v0 -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024 -; GFX6-NEXT: s_add_i32 s6, s18, 0x400 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: 
.LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v5, v0 @@ -7793,22 +8211,22 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) ; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v1, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB18_1 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 - %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } - +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 08a997530d3c94..16f29cc329976c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-LABEL: buffer_ptr_vector_ops: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx8 
s[0:7], s[8:9], 0x0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; ; SDAG-LABEL: buffer_ptr_vector_ops: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -60,16 +60,16 @@ main_body: define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace(1) %dest) { ; GISEL-LABEL: buffer_structs: ; GISEL: ; %bb.0: ; %main_body -; GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GISEL-NEXT: v_mov_b32_e32 v5, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_ashr_i32 s1, s0, 31 -; GISEL-NEXT: v_mov_b32_e32 v4, s0 -; GISEL-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 ; GISEL-NEXT: s_add_u32 s0, s8, s0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s2 ; GISEL-NEXT: s_addc_u32 s1, s9, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -81,15 +81,15 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace ; ; SDAG-LABEL: buffer_structs: ; SDAG: ; %bb.0: ; %main_body -; SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_ashr_i32 s1, s0, 31 -; SDAG-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; 
SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_lshl_b64 s[0:1], s[2:3], 5 ; SDAG-NEXT: s_add_u32 s0, s8, s0 +; SDAG-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-NEXT: s_addc_u32 s1, s9, s1 ; SDAG-NEXT: buffer_store_dword v0, v0, s[4:7], 0 offen ; SDAG-NEXT: global_store_dword v4, v0, s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 8293280609517a..b26d15ed3a1c8a 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector2: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -19,7 +19,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector2: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -40,7 +40,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector2: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -52,7 +52,7 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: 
build_vector2: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector4: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 5 @@ -80,7 +80,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 6 ; GFX8-NEXT: v_mov_b32_e32 v2, 7 @@ -93,7 +93,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector4: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, 6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector4: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 5 ; GFX11-NEXT: v_mov_b32_e32 v1, 6 @@ -119,7 +119,7 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector4: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: v_mov_b32_e32 v0, 5 ; GFX940-NEXT: v_mov_b32_e32 v1, 6 @@ -136,7 +136,7 @@ 
entry: define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; GFX6-LABEL: build_vector_v2i16: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005 @@ -146,7 +146,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX8-LABEL: build_vector_v2i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX10-LABEL: build_vector_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -165,7 +165,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX11-LABEL: build_vector_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) { ; ; GFX940-LABEL: build_vector_v2i16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -190,8 +190,8 @@ entry: define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: build_vector_v2i16_trunc: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, 
s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -201,10 +201,10 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX8-LABEL: build_vector_v2i16_trunc: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s2, s4, 16 +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s2, 0x50000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -215,11 +215,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX10-LABEL: build_vector_v2i16_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -228,11 +228,11 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX11-LABEL: build_vector_v2i16_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hl_b32_b16 s2, s4, 5 +; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,14 +242,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; ; GFX940-LABEL: build_vector_v2i16_trunc: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshr_b32 s2, s4, 16 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 5 -; GFX940-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm %srl = lshr i32 %a, 16 %trunc = trunc i32 %srl to i16 @@ -262,7 +262,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { ; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -290,7 +290,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; 
%entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -302,7 +302,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 @@ -316,7 +316,7 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s3, s3, 16 diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll index 5d1647782b0d8f..f1992d71eb1de8 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 { ; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 ; GCN-NEXT: s_setpc_b64 -define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { +define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %op = add i32 %id, %arg0 ret i32 %op @@ -64,7 +64,7 @@ define hidden i32 @use_workitem_id_x(i32 %arg0) #3 { ; GCN: v_mov_b32_e32 v0, 9 ; GCN: s_swappc_b64 ; GCN: v_add_f32_e32 -define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 { +define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 { %val = call float @use_workitem_id_x(i32 9) %op = fadd float %val, 1.0 store volatile float %op, ptr 
addrspace(1) undef @@ -112,4 +112,3 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind noinline } attributes #1 = { alwaysinline nounwind } attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nounwind noinline "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 35a1547f3f3ae8..4368af1a611120 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -273,8 +273,8 @@ entry: ret void } -attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline norecurse } +attributes #1 = { nounwind noinline norecurse } attributes #2 = { nounwind noinline } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 06dec7e792389f..c62a0824591050 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -10,9 +10,9 @@ declare hidden void @callee() #0 define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) 
!reqd_work_group_size !0 { ; CHECK-LABEL: known_x_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 @@ -30,9 +30,9 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_size !1 { ; CHECK-LABEL: known_y_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -49,9 +49,9 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_size !2 { ; CHECK-LABEL: known_z_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -68,9 +68,9 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_size !3 { ; CHECK-LABEL: known_yz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 
flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -87,9 +87,9 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_size !4 { ; CHECK-LABEL: known_xz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 ; CHECK-NEXT: s_mov_b32 s32, 0 @@ -107,9 +107,9 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_size !5 { ; CHECK-LABEL: known_xyz_0: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v31, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8ef2d89e76d4e1..b711542be5a7fc 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI 
%s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index b52e7918b27ab1..1d2523d364e550 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -1,7 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri < %s | llc -mcpu=gfx90a -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx90a -mattr=-xnack < %s | llc -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 9792c9dabac2f6..5e6f377da28e15 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,6 +1,4 @@ -; RUN: opt 
-mcpu=kaveri -passes=amdgpu-attributor < %s | llc -enable-ipra=0 | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s - -target triple = "amdgcn-amd-amdhsa" +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 231d3d97c8f4f3..15ebdd70ae8818 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -8,7 +8,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; SI-LABEL: kernel: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -18,7 +18,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -28,7 +28,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -115,32 +115,21 @@ define amdgpu_kernel void @call_coldcc() #0 { ; SI-LABEL: call_coldcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: 
s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -152,49 +141,31 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], 
s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_coldcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, coldcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, coldcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -206,32 +177,21 @@ define amdgpu_kernel void @call_fastcc() #0 { ; SI-LABEL: call_fastcc: ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s21, 
SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s23, 0xe8f000 -; SI-NEXT: s_add_u32 s20, s20, s9 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_mov_b32 s14, s8 -; SI-NEXT: s_mov_b64 s[10:11], s[4:5] -; SI-NEXT: s_add_u32 s8, s2, 36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; SI-NEXT: s_addc_u32 s9, s3, 0 -; SI-NEXT: s_getpc_b64 s[2:3] -; SI-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; SI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v31, v0, v2 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b64 s[0:1], s[20:21] -; SI-NEXT: s_mov_b64 s[2:3], s[22:23] +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -243,49 +203,31 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s90, -1 ; VI-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NEXT: s_add_u32 s88, s88, s9 +; VI-NEXT: s_add_u32 s88, s88, s1 ; VI-NEXT: s_addc_u32 s89, s89, 0 -; VI-NEXT: s_mov_b32 s14, s8 -; VI-NEXT: s_add_u32 s8, s2, 36 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: s_getpc_b64 s[2:3] -; VI-NEXT: s_add_u32 s2, s2, 
fastcc@gotpcrel32@lo+4 -; VI-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; VI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b64 s[10:11], s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_mov_b64 s[4:5], s[0:1] +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: call_fastcc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s8, s2, 36 -; GFX11-NEXT: s_addc_u32 s9, s3, 0 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, fastcc@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s3, s3, fastcc@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s13, s14 -; GFX11-NEXT: s_mov_b32 s14, s15 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -1012,7 +954,7 @@ 
define amdgpu_ps i16 @ret_ps_mesa_i16() { define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; SI-LABEL: amd_kernel_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s0, s0, s0 @@ -1023,7 +965,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; VI-LABEL: amd_kernel_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1032,7 +974,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX11-LABEL: amd_kernel_i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1050,7 +992,7 @@ entry: define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; SI-LABEL: amd_kernel_v2i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,7 +1010,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v2i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 @@ -1082,7 +1024,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v2i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 @@ -1107,7 +1049,7 @@ entry: define amdgpu_kernel void 
@amd_kernel_v4i8(<4 x i8> %arg0) { ; SI-LABEL: amd_kernel_v4i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 +; SI-NEXT: s_load_dword s1, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1135,7 +1077,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v4i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 24 ; VI-NEXT: s_lshr_b32 s2, s0, 16 @@ -1157,7 +1099,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v4i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 @@ -1194,7 +1136,7 @@ entry: define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; SI-LABEL: amd_kernel_v3i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1218,7 +1160,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v3i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1238,7 +1180,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v3i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1270,7 +1212,7 @@ entry: define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; SI-LABEL: amd_kernel_v5i8: 
; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s0, 4 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1303,7 +1245,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_lshr_b32 s3, s0, 16 @@ -1331,7 +1273,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: s_lshr_b32 s2, s0, 16 @@ -1370,7 +1312,7 @@ entry: define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; SI-LABEL: amd_kernel_v8i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1415,7 +1357,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v8i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s1, 24 ; VI-NEXT: s_lshr_b32 s3, s1, 16 @@ -1450,7 +1392,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v8i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1 @@ -1503,7 +1445,7 @@ entry: define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; SI-LABEL: 
amd_kernel_v16i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1582,7 +1524,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v16i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 24 ; VI-NEXT: s_lshr_b32 s5, s3, 16 @@ -1643,7 +1585,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v16i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s10, s3, 16 ; GFX11-NEXT: s_lshr_b32 s11, s3, 24 @@ -1724,7 +1666,7 @@ entry: define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; SI-LABEL: amd_kernel_v32i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s9, 0 ; SI-NEXT: s_mov_b32 s8, 16 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1874,7 +1816,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v32i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v10, 0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1990,7 +1932,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v32i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2 ; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll 
b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index f248708d16ea2a..a0499ef6d0f6ae 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -18,8 +18,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: sadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -34,8 +34,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: sadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -48,12 +48,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: sadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s6, s0 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -62,12 +62,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: sadd64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_u32 s0, s6, s0 -; GFX1010-NEXT: s_addc_u32 s1, s7, s1 +; GFX1010-NEXT: s_add_u32 s0, s6, s2 +; GFX1010-NEXT: s_addc_u32 s1, s7, s3 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -76,8 +76,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: sadd64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s0, s6, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: sadd64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s0, s6, s0 @@ -104,8 +104,8 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: sadd64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s6, s0 
; GFX11-NEXT: s_addc_u32 s1, s7, s1 @@ -129,7 +129,7 @@ entry: define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: sadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +144,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: sadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x56789876 @@ -157,7 +157,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: sadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -169,7 +169,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: sadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -181,7 +181,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: sadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -193,7 +193,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: sadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; 
GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876 @@ -205,7 +205,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: sadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 @@ -229,7 +229,7 @@ entry: define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vadd64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -243,7 +243,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vadd64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vadd64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -266,7 +266,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vadd64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -276,7 
+276,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vadd64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vadd64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0 @@ -296,12 +296,11 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vadd64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -323,7 +322,7 @@ entry: define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vadd64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -335,7 +334,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vadd64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -347,7 +346,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vadd64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -358,8 +357,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vadd64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2 @@ -369,7 +367,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vadd64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 @@ -379,7 +377,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vadd64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3] @@ -389,11 +387,9 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vadd64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 
-; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -415,8 +411,8 @@ entry: define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: suaddo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -427,23 +423,23 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: suaddo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_add_i32 s0, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: 
global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -451,11 +447,11 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: suaddo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_add_i32 s0, s0, s1 +; GFX1010-NEXT: s_add_i32 s0, s2, s3 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -463,37 +459,37 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: suaddo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W32-NEXT: s_add_i32 s2, s2, s3 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: suaddo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W64-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX1030W64-NEXT: s_add_i32 s2, s2, s3 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: suaddo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NEXT: s_add_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -517,28 +513,28 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: uaddo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; CISI-NEXT: s_mov_b32 s4, s6 
+; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: uaddo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -553,12 +549,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: uaddo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -567,11 +563,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: uaddo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v1, s0, s0, s1 +; GFX1010-NEXT: v_add_co_u32 v1, s0, s2, s3 ; 
GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -580,8 +576,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: uaddo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -593,8 +589,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: uaddo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5 @@ -606,8 +602,8 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: uaddo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5 @@ -635,7 +631,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: suaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -659,7 +655,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: suaddo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -679,7 +675,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: suaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -696,7 +692,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: suaddo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_add_u32 s6, s4, s6 @@ -711,7 +707,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: suaddo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_add_u32 s6, s4, s6 @@ -726,7 +722,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: suaddo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_add_u32 s6, s4, s6 @@ -741,7 
+737,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: suaddo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s4, s6 ; GFX11-NEXT: s_addc_u32 s7, s5, s7 @@ -772,31 +768,31 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vuaddo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: vuaddo64: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -813,14 +809,14 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vuaddo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -829,13 +825,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vuaddo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_add_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_add_co_u32 v0, s0, s2, v0 +; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; 
GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -844,8 +840,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vuaddo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0 @@ -859,8 +855,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vuaddo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0 @@ -874,15 +870,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vuaddo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_u64_e32 
vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -909,8 +903,8 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; CISI-LABEL: ssub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -925,8 +919,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; VI-LABEL: ssub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -939,12 +933,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX9-LABEL: ssub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 +; GFX9-NEXT: s_sub_u32 s0, s6, s2 +; GFX9-NEXT: s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -953,12 +947,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1010-LABEL: ssub64rr: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_u32 s0, s6, s0 -; GFX1010-NEXT: s_subb_u32 s1, s7, s1 +; GFX1010-NEXT: s_sub_u32 s0, s6, s2 +; GFX1010-NEXT: s_subb_u32 s1, s7, s3 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -967,8 +961,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W32-LABEL: ssub64rr: ; GFX1030W32: ; %bb.0: ; %entry ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0 @@ -981,8 +975,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1030W64-LABEL: ssub64rr: ; GFX1030W64: ; %bb.0: ; %entry ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0 @@ -995,8 +989,8 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-LABEL: ssub64rr: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s0, s6, 
s0 ; GFX11-NEXT: s_subb_u32 s1, s7, s1 @@ -1020,7 +1014,7 @@ entry: define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: ssub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,7 +1029,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: ssub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, 0x56789876, s2 @@ -1048,7 +1042,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: ssub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1060,7 +1054,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: ssub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1072,7 +1066,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: ssub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1084,7 +1078,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: ssub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; 
GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2 @@ -1096,7 +1090,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: ssub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 @@ -1120,7 +1114,7 @@ entry: define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; CISI-LABEL: vsub64rr: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s7, 0xf000 ; CISI-NEXT: s_mov_b32 s6, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1134,7 +1128,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: vsub64rr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0 @@ -1146,7 +1140,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX9-LABEL: vsub64rr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1157,7 +1151,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1010-LABEL: vsub64rr: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ 
-1167,7 +1161,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W32-LABEL: vsub64rr: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0 @@ -1177,7 +1171,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1030W64-LABEL: vsub64rr: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0 @@ -1187,12 +1181,11 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX11-LABEL: vsub64rr: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1214,7 +1207,7 @@ entry: define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; CISI-LABEL: vsub64ri: ; CISI: ; %bb.0: ; %entry -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0 ; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 ; CISI-NEXT: s_mov_b32 s3, 0xf000 @@ -1226,7 +1219,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; VI-LABEL: vsub64ri: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0x1234 ; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1238,7 +1231,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX9-LABEL: vsub64ri: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1249,8 +1242,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1010-LABEL: vsub64ri: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1010-NEXT: s_mov_b32 null, 0 +; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 @@ -1260,7 +1252,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W32-LABEL: vsub64ri: ; GFX1030W32: ; %bb.0: ; %entry -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 @@ -1270,7 +1262,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1030W64-LABEL: vsub64ri: ; GFX1030W64: ; %bb.0: ; %entry -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] @@ -1280,11 +1272,9 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX11-LABEL: vsub64ri: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1307,8 +1297,8 @@ entry: define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: susubo32: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; CISI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; CISI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s3, 0xf000 ; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1319,23 +1309,23 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sub_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sub_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: susubo32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, 
s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -1343,11 +1333,11 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: susubo32: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_sub_i32 s0, s0, s1 +; GFX1010-NEXT: s_sub_i32 s0, s2, s3 ; GFX1010-NEXT: v_mov_b32_e32 v1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: s_endpgm @@ -1355,37 +1345,37 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: susubo32: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_sub_i32 s0, s0, s1 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W32-NEXT: s_endpgm ; ; GFX1030W64-LABEL: susubo32: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_sub_i32 s0, s0, s1 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 -; 
GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030W64-NEXT: s_endpgm ; ; GFX11-LABEL: susubo32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s0, s0, s1 +; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1409,28 +1399,28 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; CISI-LABEL: usubo32_vcc_user: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v0, s13 -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v0, s9 +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; 
CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 ; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CISI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v1, off, s[4:7], 0 ; CISI-NEXT: s_endpgm ; ; VI-LABEL: usubo32_vcc_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1445,12 +1435,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: usubo32_vcc_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -1459,11 +1449,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1010-LABEL: usubo32_vcc_user: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v1, s0, s0, s1 +; GFX1010-NEXT: 
v_sub_co_u32 v1, s0, s2, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1010-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1010-NEXT: global_store_byte v0, v2, s[6:7] @@ -1472,8 +1462,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W32-LABEL: usubo32_vcc_user: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1485,8 +1475,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1030W64-LABEL: usubo32_vcc_user: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5 @@ -1498,8 +1488,8 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: usubo32_vcc_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5 @@ -1527,7 +1517,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; CISI-LABEL: susubo64: ; CISI: ; 
%bb.0: -; CISI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CISI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CISI-NEXT: s_mov_b32 s11, 0xf000 ; CISI-NEXT: s_mov_b32 s10, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) @@ -1551,7 +1541,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; VI-LABEL: susubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -1571,7 +1561,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: susubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -1588,7 +1578,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1010-LABEL: susubo64: ; GFX1010: ; %bb.0: -; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_sub_u32 s6, s4, s6 @@ -1603,7 +1593,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W32-LABEL: susubo64: ; GFX1030W32: ; %bb.0: -; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6 @@ -1618,7 +1608,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1030W64-LABEL: susubo64: ; GFX1030W64: ; %bb.0: -; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1030W64-NEXT: s_sub_u32 s6, s4, s6 @@ -1633,7 +1623,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX11-LABEL: susubo64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s6, s4, s6 ; GFX11-NEXT: s_subb_u32 s7, s5, s7 @@ -1664,31 +1654,31 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { ; CISI-LABEL: vusubo64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; CISI-NEXT: s_mov_b32 s11, 0xf000 -; CISI-NEXT: s_mov_b32 s10, -1 -; CISI-NEXT: s_mov_b32 s2, s10 +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CISI-NEXT: s_mov_b32 s3, 0xf000 +; CISI-NEXT: s_mov_b32 s2, -1 ; CISI-NEXT: s_waitcnt lgkmcnt(0) -; CISI-NEXT: s_mov_b32 s8, s4 -; CISI-NEXT: v_mov_b32_e32 v1, s13 -; CISI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; CISI-NEXT: s_mov_b32 s0, s4 +; CISI-NEXT: v_mov_b32_e32 v1, s9 +; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; CISI-NEXT: s_mov_b32 s9, s5 -; CISI-NEXT: s_mov_b32 s0, s6 -; CISI-NEXT: s_mov_b32 s1, s7 -; CISI-NEXT: s_mov_b32 s3, s11 -; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; CISI-NEXT: s_mov_b32 s1, s5 +; CISI-NEXT: s_mov_b32 s4, s6 +; CISI-NEXT: s_mov_b32 s5, s7 +; CISI-NEXT: s_mov_b32 s6, s2 +; CISI-NEXT: s_mov_b32 s7, s3 +; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CISI-NEXT: s_waitcnt expcnt(0) ; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CISI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CISI-NEXT: 
s_endpgm ; ; VI-LABEL: vusubo64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v6, s1 @@ -1705,14 +1695,14 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX9-LABEL: vusubo64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[6:7] @@ -1721,13 +1711,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1010-LABEL: vusubo64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: v_sub_co_u32 v0, s2, s0, v0 -; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s1, 0, s2 -; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX1010-NEXT: v_sub_co_u32 v0, s0, s2, v0 +; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] ; 
GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1010-NEXT: global_store_byte v2, v3, s[6:7] @@ -1736,8 +1726,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W32-LABEL: vusubo64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0 @@ -1751,8 +1741,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1030W64-LABEL: vusubo64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0 @@ -1766,15 +1756,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-LABEL: vusubo64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX11-NEXT: s_clause 0x1 @@ -1804,8 +1792,8 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-LABEL: sudiv64: ; CISI: ; %bb.0: -; CISI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CISI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; CISI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CISI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; CISI-NEXT: s_waitcnt lgkmcnt(0) ; CISI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; CISI-NEXT: s_mov_b32 s0, 0 @@ -1955,8 +1943,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; VI-LABEL: sudiv64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; VI-NEXT: s_mov_b32 s0, 0 @@ -2112,18 +2100,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX9-LABEL: sudiv64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_u32 s0, 0, s8 -; GFX9-NEXT: s_subb_u32 s1, 0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_u32 s0, 0, s2 +; GFX9-NEXT: s_subb_u32 s1, 0, 
s3 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2196,24 +2184,24 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_mul_i32 s0, s7, s0 ; GFX9-NEXT: s_add_u32 s11, s1, s0 ; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s0, s8, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s11 +; GFX9-NEXT: s_mul_i32 s0, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s11 +; GFX9-NEXT: s_mul_i32 s1, s3, s11 ; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s11 +; GFX9-NEXT: s_mul_i32 s1, s2, s11 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_sub_i32 s0, s7, s12 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s13, s0, s9 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s8, v0 +; GFX9-NEXT: s_subb_u32 s13, s0, s3 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s9 +; GFX9-NEXT: s_cmp_ge_u32 s13, s3 ; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s13, s9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 +; GFX9-NEXT: s_cmp_eq_u32 s13, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2231,10 +2219,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: s_subb_u32 s0, s7, s12 -; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s3 ; GFX9-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2246,27 +2234,27 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_sub_i32 s0, 0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s0, 0, s2 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s0, s2, s0 -; GFX9-NEXT: s_add_i32 s2, s2, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 -; GFX9-NEXT: s_mul_i32 s3, s0, s8 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_sub_i32 s6, s3, s8 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 +; GFX9-NEXT: s_add_i32 s3, s3, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s3 +; GFX9-NEXT: s_mul_i32 s7, s0, s2 +; GFX9-NEXT: s_sub_i32 s6, s6, s7 +; GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_sub_i32 s7, s6, s2 +; GFX9-NEXT: s_cmp_ge_u32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 +; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s3, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: .LBB16_3: @@ -2280,18 +2268,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-LABEL: sudiv64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_or_b64 s[2:3], s[6:7], s[8:9] -; GFX1010-NEXT: s_mov_b32 s2, 0 -; GFX1010-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1010-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GFX1010-NEXT: s_mov_b32 s8, 0 +; GFX1010-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX1010-NEXT: s_sub_u32 s3, 0, s8 -; GFX1010-NEXT: s_subb_u32 s10, 0, s9 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX1010-NEXT: s_sub_u32 s9, 0, s2 +; GFX1010-NEXT: s_subb_u32 s10, 0, s3 ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2302,11 +2290,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s1 +; GFX1010-NEXT: s_mul_i32 s11, s9, s0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s1 ; GFX1010-NEXT: s_mul_i32 s12, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s3, s1 +; GFX1010-NEXT: s_mul_i32 s14, s9, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s12 ; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14 ; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 @@ -2326,76 +2314,76 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1010-NEXT: s_addc_u32 s0, s0, s11 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s11, s3, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s3, s1 +; GFX1010-NEXT: s_mul_i32 s11, 
s9, s0 +; GFX1010-NEXT: s_mul_hi_u32 s12, s9, s1 ; GFX1010-NEXT: s_mul_i32 s10, s10, s1 ; GFX1010-NEXT: s_add_i32 s11, s12, s11 -; GFX1010-NEXT: s_mul_i32 s3, s3, s1 +; GFX1010-NEXT: s_mul_i32 s9, s9, s1 ; GFX1010-NEXT: s_add_i32 s11, s11, s10 -; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX1010-NEXT: s_mul_i32 s13, s0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s9 +; GFX1010-NEXT: s_mul_i32 s13, s0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s9, s1, s9 ; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s11 ; GFX1010-NEXT: s_mul_i32 s1, s1, s11 ; GFX1010-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1010-NEXT: s_add_u32 s1, s3, s1 -; GFX1010-NEXT: s_addc_u32 s3, 0, s14 +; GFX1010-NEXT: s_add_u32 s1, s9, s1 +; GFX1010-NEXT: s_addc_u32 s9, 0, s14 ; GFX1010-NEXT: s_add_u32 s1, s1, s13 ; GFX1010-NEXT: s_mul_i32 s11, s0, s11 -; GFX1010-NEXT: s_addc_u32 s1, s3, s12 -; GFX1010-NEXT: s_addc_u32 s3, s10, 0 +; GFX1010-NEXT: s_addc_u32 s1, s9, s12 +; GFX1010-NEXT: s_addc_u32 s9, s10, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s11 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 ; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 ; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s3 +; GFX1010-NEXT: s_addc_u32 s0, s0, s9 ; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 ; GFX1010-NEXT: s_mul_i32 s10, s6, s0 -; GFX1010-NEXT: s_mul_hi_u32 s3, s6, s0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s0 ; GFX1010-NEXT: s_mul_hi_u32 s11, s7, s0 ; GFX1010-NEXT: s_mul_i32 s0, s7, s0 ; GFX1010-NEXT: s_mul_hi_u32 s12, s6, s1 ; GFX1010-NEXT: s_mul_hi_u32 s13, s7, s1 ; GFX1010-NEXT: s_mul_i32 s1, s7, s1 ; GFX1010-NEXT: s_add_u32 s10, s12, s10 -; GFX1010-NEXT: s_addc_u32 s3, 0, s3 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 ; GFX1010-NEXT: s_add_u32 s1, s10, s1 -; GFX1010-NEXT: s_addc_u32 s1, s3, s13 -; GFX1010-NEXT: s_addc_u32 s3, s11, 0 +; GFX1010-NEXT: s_addc_u32 s1, s9, s13 +; GFX1010-NEXT: s_addc_u32 s9, s11, 0 ; GFX1010-NEXT: s_add_u32 s1, s1, s0 -; GFX1010-NEXT: 
s_addc_u32 s3, 0, s3 -; GFX1010-NEXT: s_mul_hi_u32 s0, s8, s1 -; GFX1010-NEXT: s_mul_i32 s11, s8, s3 -; GFX1010-NEXT: s_mul_i32 s12, s8, s1 +; GFX1010-NEXT: s_addc_u32 s9, 0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX1010-NEXT: s_mul_i32 s11, s2, s9 +; GFX1010-NEXT: s_mul_i32 s12, s2, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s11 ; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1010-NEXT: s_mul_i32 s10, s9, s1 +; GFX1010-NEXT: s_mul_i32 s10, s3, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s10 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s8 +; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 ; GFX1010-NEXT: s_sub_i32 s10, s7, s0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_subb_u32 s10, s10, s9 +; GFX1010-NEXT: s_subb_u32 s10, s10, s3 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v1 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 ; GFX1010-NEXT: s_subb_u32 s10, s10, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s10, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s10, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s10, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s10, s3 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1010-NEXT: s_add_u32 s10, s1, 1 ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s3, 0 +; GFX1010-NEXT: s_addc_u32 s12, s9, 0 ; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s3, 0 +; GFX1010-NEXT: s_addc_u32 s14, s9, 0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0 +; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 ; GFX1010-NEXT: s_subb_u32 s0, s7, s0 ; GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s9 +; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; 
GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14 @@ -2403,13 +2391,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s2 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX1010-NEXT: s_sub_i32 s1, 0, s8 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1010-NEXT: s_sub_i32 s1, 0, s2 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2418,17 +2406,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX1010-NEXT: s_add_i32 s0, s0, s1 ; GFX1010-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1010-NEXT: s_mul_i32 s1, s0, s8 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 +; GFX1010-NEXT: s_mul_i32 s1, s0, s2 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 ; GFX1010-NEXT: s_sub_i32 s1, s6, s1 -; GFX1010-NEXT: s_sub_i32 s3, s1, s8 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 -; GFX1010-NEXT: s_cselect_b32 s1, s3, s1 -; GFX1010-NEXT: s_add_i32 s2, s0, 1 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s8 +; GFX1010-NEXT: s_sub_i32 s6, s1, s2 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1010-NEXT: s_cselect_b32 s1, s6, s1 +; GFX1010-NEXT: s_add_i32 s3, s0, 1 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 ; GFX1010-NEXT: s_mov_b32 s1, 0 -; GFX1010-NEXT: s_cselect_b32 s0, s2, s0 +; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 ; GFX1010-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX1010-NEXT: .LBB16_3: @@ -2442,8 +2430,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-LABEL: sudiv64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX1030W32-NEXT: s_mov_b32 s8, 0 @@ -2604,8 +2592,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-LABEL: sudiv64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] ; GFX1030W64-NEXT: s_mov_b32 s0, 0 @@ -2765,8 +2753,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-LABEL: sudiv64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index 8a39a52cd25eab..8e773cad3b3357 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -27,7 +27,7 @@ entry: define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; 
GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; ; GFX900-LABEL: test_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { ; GFX1010-LABEL: test_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 @@ -266,7 +266,7 @@ entry: define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX803-LABEL: test_force_fp_kern_stack: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 @@ -276,7 +276,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; ; GFX900-LABEL: test_force_fp_kern_stack: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 @@ -287,7 +287,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { ; GFX1010-LABEL: test_force_fp_kern_stack: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: v_mov_b32_e32 v0, 0 -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: buffer_store_dword v0, 
off, s[0:3], s33 @@ -509,7 +509,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; GFX803-LABEL: test_sgpr_offset_kernel: ; GFX803: ; %bb.0: ; %entry -; GFX803-NEXT: s_add_u32 s0, s0, s15 +; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX900-LABEL: test_sgpr_offset_kernel: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -541,7 +541,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { ; ; GFX1010-LABEL: test_sgpr_offset_kernel: ; GFX1010: ; %bb.0: ; %entry -; GFX1010-NEXT: s_add_u32 s0, s0, s15 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 3e25904aa044dd..2d3d7fafa72c0b 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: test_loop: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0xa +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, -1 +; GCN-NEXT: s_cmp_eq_u32 s2, -1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -108,7 +108,7 @@ for.body: 
define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_true: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, -1 @@ -199,7 +199,7 @@ for.body: define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_false: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -283,7 +283,7 @@ for.body: define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-LABEL: loop_const_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 m0, -1 @@ -368,7 +368,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_bitcmp1_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index b23249570faa7d..1588dde19cfb78 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 21e2a85ab18d98..da609bfa8edea6 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 237e06def15763..12ef7657b19130 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -119,7 +119,7 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c +; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c ; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..397efb126053fc 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -425,9 +425,9 @@ bb: define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { ; GFX900-LABEL: vload2_private: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_add_u32 s0, s0, s15 +; GFX900-NEXT: s_add_u32 s0, s0, s9 ; GFX900-NEXT: 
s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] @@ -456,10 +456,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR-LABEL: vload2_private: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; FLATSCR-NEXT: s_mov_b32 s4, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] @@ -483,9 +483,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX10_DEFAULT-LABEL: vload2_private: ; GFX10_DEFAULT: ; %bb.0: ; %entry -; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 -; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s15 +; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9 ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] @@ -514,11 +514,11 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; FLATSCR_GFX10-LABEL: vload2_private: ; FLATSCR_GFX10: ; %bb.0: ; %entry -; FLATSCR_GFX10-NEXT: s_add_u32 s6, s6, s11 -; FLATSCR_GFX10-NEXT: s_addc_u32 s7, s7, 0 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; FLATSCR_GFX10-NEXT: s_add_u32 s2, s2, s5 +; FLATSCR_GFX10-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; 
FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -545,7 +545,7 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; ; GFX11-LABEL: vload2_private: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index e1717a816de0d2..84bd9b6f6c5d48 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -51,9 +51,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: 
v_clamp_add_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -77,7 +75,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_multi_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -97,7 +95,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -117,7 +115,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -131,14 +129,13 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_multi_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -161,7 +158,7 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_dbg_use_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -177,7 +174,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_dbg_use_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -194,7 +191,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_dbg_use_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -205,9 +202,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_dbg_use_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ 
-232,7 +227,7 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_neg_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -249,7 +244,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_add_neg_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -267,7 +262,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_add_neg_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -279,14 +274,13 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_add_neg_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_floor_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -307,7 +301,7 @@ define amdgpu_kernel void 
@v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_non_clamp_max_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +318,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_non_clamp_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -342,7 +336,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_non_clamp_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -354,14 +348,13 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_non_clamp_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -380,7 +373,7 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void 
@v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-LABEL: v_clamp_add_src_f32_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -396,7 +389,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_add_src_f32_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -413,7 +406,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_add_src_f32_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -424,9 +417,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_add_src_f32_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +441,7 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 @@ -468,7 +459,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_add_src_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -485,7 +476,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_add_src_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -496,9 +487,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_add_src_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -522,7 +511,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -540,7 +529,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -557,7 +546,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -568,9 +557,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -594,7 +581,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -611,7 +598,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_add_src_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -629,7 +616,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_add_src_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -641,9 +628,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_add_src_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -668,7 +653,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -684,7 +669,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_add_src_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -701,7 +686,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_add_src_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -712,9 +697,7 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_add_src_f64: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -738,26 +721,26 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspace(1) %aptr, float %a) #0 { ; SI-LABEL: v_clamp_mac_to_mad: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mad_f32 v3, s0, s0, v2 clamp +; SI-NEXT: v_mad_f32 v3, s8, s8, v2 clamp ; SI-NEXT: v_add_f32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: v_clamp_mac_to_mad: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -775,31 +758,28 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_mac_to_mad: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_f32 v2, s0, s0, v1 clamp +; GFX9-NEXT: v_mad_f32 v2, s2, s2, v1 clamp ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_clamp_mac_to_mad: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: v_mul_f32_e64 v2, s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v2, v2, v1 clamp -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -822,7 +802,7 @@ define amdgpu_kernel void @v_clamp_mac_to_mad(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -846,7 +826,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: 
v_clamp_add_src_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -866,7 +846,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -877,9 +857,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -903,7 +881,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; SI-LABEL: v_clamp_add_src_v2f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -927,7 +905,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -947,7 +925,7 @@ define 
amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -958,9 +936,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -984,7 +960,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1016,7 +992,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1038,7 +1014,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1050,14 +1026,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1078,7 +1053,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1103,7 +1078,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1124,7 +1099,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1136,14 +1111,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1166,7 +1140,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1191,7 +1165,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,7 +1186,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1224,14 +1198,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1254,7 +1227,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1) define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1278,7 +1251,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,7 +1271,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1310,14 +1283,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o ; ; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1339,7 +1311,7 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1362,7 +1334,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1382,7 +1354,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX9-LABEL: 
v_no_clamp_add_src_v2f16_f32_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1394,14 +1366,13 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1423,7 +1394,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_packed_src_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1448,7 +1419,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_no_clamp_add_packed_src_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,7 +1440,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: 
v_no_clamp_add_packed_src_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1481,14 +1452,13 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_no_clamp_add_packed_src_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1511,7 +1481,7 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1535,7 +1505,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,7 +1523,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; 
GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,9 +1536,7 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 9b6c50c10d90dd..947284506a2970 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -52,9 +52,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -67,9 +65,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -93,7 +89,7 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -109,7 +105,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -126,7 +122,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -137,9 +133,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -152,9 +146,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -179,7 +171,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -195,7 +187,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -212,7 +204,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f32: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -223,9 +215,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,9 +228,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -267,7 +255,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -285,7 +273,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -304,7 +292,7 @@ 
define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -317,14 +305,13 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -333,14 +320,13 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -363,7 +349,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void 
@v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -381,7 +367,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +386,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -413,14 +399,13 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -429,14 +414,13 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, ; ; GFX12-LABEL: 
v_clamp_negzero_maybe_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -456,7 +440,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_multi_use_max_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -477,7 +461,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_multi_use_max_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -498,7 +482,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_multi_use_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -513,16 +497,14 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: 
v_clamp_multi_use_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc @@ -533,16 +515,14 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_multi_use_max_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -566,7 +546,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -583,7 +563,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -600,7 +580,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -611,9 +591,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -626,9 +604,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -652,7 +628,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -669,7 +645,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -686,7 +662,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -697,9 +673,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -712,9 +686,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -739,7 +711,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f16(ptr 
addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -756,7 +728,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -773,7 +745,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -784,9 +756,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -799,9 +769,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -828,7 +796,7 @@ define 
amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -844,7 +812,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -861,7 +829,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -872,9 +840,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -887,9 +853,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -913,7 +877,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -929,7 +893,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_clamp_neg_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -946,7 +910,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_clamp_neg_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -957,9 +921,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_clamp_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -972,9 +934,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: v_clamp_neg_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -999,7 +959,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_negabs_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1015,7 +975,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: v_clamp_negabs_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1032,7 +992,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_clamp_negabs_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1043,9 +1003,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX11-LABEL: v_clamp_negabs_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1058,9 +1016,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: v_clamp_negabs_f64: ; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -1087,7 +1043,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1104,7 +1060,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX8-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1122,7 +1078,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1134,9 +1090,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1149,9 +1103,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: v_clamp_med3_aby_negzero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1173,7 +1125,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_aby_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1189,7 +1141,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_aby_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1206,7 +1158,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_aby_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1217,9 +1169,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_aby_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1232,9 +1182,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_aby_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1256,7 +1204,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bay_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1272,7 +1220,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bay_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1289,7 +1237,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bay_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1300,9 +1248,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bay_f32: 
; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1315,9 +1261,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bay_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1339,7 +1283,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yab_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1355,7 +1299,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yab_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1372,7 +1316,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yab_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ 
-1383,9 +1327,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yab_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1398,9 +1340,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yab_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1422,7 +1362,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_yba_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1438,7 +1378,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_yba_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1455,7 +1395,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_yba_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1466,9 +1406,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_yba_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1481,9 +1419,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_yba_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1505,7 +1441,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_ayb_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1521,7 +1457,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_ayb_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1538,7 +1474,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; 
GFX9-LABEL: v_clamp_med3_ayb_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1549,9 +1485,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_ayb_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1564,9 +1498,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_ayb_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1588,7 +1520,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_med3_bya_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1604,7 +1536,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_med3_bya_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1621,7 +1553,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_med3_bya_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1632,9 +1564,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_med3_bya_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1647,9 +1577,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_med3_bya_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1671,7 +1599,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_one_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1683,7 +1611,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX8-LABEL: v_clamp_constants_to_one_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1695,7 +1623,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX9-LABEL: v_clamp_constants_to_one_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1704,10 +1632,8 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX11-LABEL: v_clamp_constants_to_one_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1716,10 +1642,8 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # ; ; GFX12-LABEL: v_clamp_constants_to_one_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1735,7 +1659,7 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) # define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constants_to_zero_f32: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1746,7 +1670,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constants_to_zero_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1758,7 +1682,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constants_to_zero_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1767,10 +1691,8 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constants_to_zero_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1779,10 +1701,8 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constants_to_zero_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1798,7 +1718,7 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1810,7 +1730,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_clamp_constant_preserve_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0.5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,7 +1742,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_clamp_constant_preserve_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1831,10 +1751,8 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_clamp_constant_preserve_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1843,10 +1761,8 @@ define amdgpu_kernel void 
@v_clamp_constant_preserve_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_clamp_constant_preserve_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1862,7 +1778,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out) define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1874,7 +1790,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1886,7 +1802,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1895,9 +1811,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1907,9 +1821,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1926,7 +1838,7 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1) define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_qnan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1937,7 +1849,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_qnan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,7 +1861,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1958,10 +1870,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1970,10 +1880,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -1989,7 +1897,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; GFX6-LABEL: v_clamp_constant_snan_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2000,7 +1908,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX8-LABEL: v_clamp_constant_snan_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
@@ -2012,7 +1920,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: v_clamp_constant_snan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2021,10 +1929,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: v_clamp_constant_snan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2033,10 +1939,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { ; ; GFX12-LABEL: v_clamp_constant_snan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2056,7 +1960,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -2073,7 +1977,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2091,7 +1995,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2103,14 +2007,13 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2119,9 +2022,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ 
-2146,7 +2047,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #3 { ; GFX6-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2162,7 +2063,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2080,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2190,9 +2091,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2205,9 +2104,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: v_clamp_f32_snan_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2232,7 +2129,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2249,7 +2146,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2267,7 +2164,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2279,14 +2176,13 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2295,9 +2191,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2321,7 +2215,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out, define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2338,7 +2232,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2356,7 +2250,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2368,14 +2262,13 @@ define amdgpu_kernel void 
@v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2384,9 +2277,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( ; ; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2411,7 +2302,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace( define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2427,7 +2318,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2444,7 +2335,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2455,9 +2346,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2470,9 +2359,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2494,7 +2381,7 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2510,7 +2397,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr 
addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2527,7 +2414,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2538,9 +2425,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2553,9 +2438,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2577,7 +2460,7 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2593,7 +2476,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2610,7 +2493,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2621,9 +2504,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2636,9 +2517,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2660,7 +2539,7 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel 
void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2676,7 +2555,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2693,7 +2572,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2704,9 +2583,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2719,9 +2596,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2743,7 +2618,7 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2759,7 +2634,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2776,7 +2651,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2787,9 +2662,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2802,9 +2675,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: 
v_clamp_med3_ayb_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2826,7 +2697,7 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2842,7 +2713,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2859,7 +2730,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2870,9 +2741,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2885,9 +2754,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % ; ; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2909,7 +2776,7 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) % define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2921,7 +2788,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2933,7 +2800,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2942,9 +2809,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: 
v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2954,10 +2819,8 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -2973,7 +2836,7 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace(1) %out) #2 { ; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2985,7 +2848,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2997,7 +2860,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX9-LABEL: 
v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3006,9 +2869,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3018,10 +2879,8 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace ; ; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3037,7 +2896,7 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3059,7 +2918,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_clamp_v2f16: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3078,7 +2937,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_clamp_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3089,9 +2948,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_clamp_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3104,9 +2961,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: v_clamp_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3130,7 +2985,7 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_elt: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3156,7 +3011,7 @@ define 
amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_clamp_v2f16_undef_elt: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3180,7 +3035,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_v2f16_undef_elt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3191,9 +3046,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_v2f16_undef_elt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3206,9 +3059,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_v2f16_undef_elt: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3232,7 +3083,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_zero: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3256,7 +3107,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: v_clamp_v2f16_not_zero: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3277,7 +3128,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_clamp_v2f16_not_zero: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3290,16 +3141,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_clamp_v2f16_not_zero: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3308,16 +3157,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: v_clamp_v2f16_not_zero: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3337,7 +3184,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_not_one: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3360,7 +3207,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_not_one: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3381,7 +3228,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_not_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3394,16 +3241,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: 
v_clamp_v2f16_not_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3412,16 +3257,14 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_not_one: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3441,7 +3284,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neg_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3464,7 +3307,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_clamp_neg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3483,7 +3326,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_clamp_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3494,9 +3337,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_clamp_neg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3509,9 +3350,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: v_clamp_neg_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3536,7 +3375,7 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: 
v_clamp_negabs_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3559,7 +3398,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: v_clamp_negabs_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3578,7 +3417,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: v_clamp_negabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3590,14 +3429,13 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: v_clamp_negabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3606,14 +3444,13 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: v_clamp_negabs_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -3636,7 +3473,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neglo_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3659,7 +3496,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neglo_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3678,7 +3515,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neglo_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3689,9 +3526,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_clamp_neglo_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3704,9 +3539,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neglo_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3733,7 +3566,7 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_neghi_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3755,7 +3588,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: v_clamp_neghi_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3774,7 +3607,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_clamp_neghi_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3785,9 +3618,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; 
GFX11-LABEL: v_clamp_neghi_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3800,9 +3631,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: v_clamp_neghi_v2f16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3829,7 +3658,7 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_shuffle: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3851,7 +3680,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: v_clamp_v2f16_shuffle: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3870,7 +3699,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_clamp_v2f16_shuffle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -3881,9 +3710,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: v_clamp_v2f16_shuffle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3896,9 +3723,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: v_clamp_v2f16_shuffle: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3923,7 +3748,7 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3949,7 +3774,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3973,7 +3798,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3984,9 +3809,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3999,9 +3822,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4025,7 +3846,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4051,7 +3872,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 
0x7e00 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4075,7 +3896,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4086,9 +3907,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4101,9 +3920,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; ; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4127,7 +3944,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 ; GFX6-LABEL: v_clamp_diff_source_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_load_dword s2, s[2:3], 0x2 @@ -4144,7 +3961,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: 
v_clamp_diff_source_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8 @@ -4163,7 +3980,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_clamp_diff_source_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 @@ -4179,7 +3996,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_clamp_diff_source_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4197,7 +4014,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_clamp_diff_source_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index fad1d47f55fd79..b6948dab6bf9f2 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,7 +30,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -49,7 +49,7 
@@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -96,7 +96,7 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 @@ -155,7 +155,7 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -175,7 +175,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 @@ -223,7 +223,7 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll 
b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll index dcd088e2bd9886..9edf5663359254 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -9,7 +9,7 @@ ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] -define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { +define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %lid @@ -24,7 +24,7 @@ bb: ; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}], ; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} ; GCN: global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}] -define amdgpu_kernel void @test_vector_creation() #0 { +define amdgpu_kernel void @test_vector_creation() { entry: %tmp231 = load <4 x i16>, ptr addrspace(1) undef, align 2 %vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> @@ -35,5 +35,3 @@ entry: } declare i32 @llvm.amdgcn.workitem.id.x() - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index 3035a8579c8a6d..9321bc262c4a49 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -12,7 +12,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: 
.amdhsa_user_sgpr_count 6 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 @@ -31,7 +31,7 @@ ; OSABI-AMDHSA-ASM: .section .rodata,"a" ; OSABI-AMDHSA-ASM: .p2align 6 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub -; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10 +; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 6 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index a75c7c2ead8823..c724e73642adee 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -175,7 +175,7 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_4 ; GCN-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -367,7 +367,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_if_else: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; 
GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -595,7 +595,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: nested_if_else_if: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -884,10 +884,10 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-LABEL: s_endpgm_unsafe_barrier: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %bb.then -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v1, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: .LBB4_2: ; %bb.end -; GCN-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index df223b3ec1354d..33c0d90f94a397 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ 
-20,7 +20,7 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -88,7 +88,7 @@ bb: define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: sub1: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: sub1: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -127,33 +127,33 @@ bb: define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: add_adde: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v4, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], 
s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: add_adde: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -171,33 +171,33 @@ bb: define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: adde_add: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: adde_add: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,33 +215,33 @@ bb: define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; 
GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -259,35 +259,35 @@ bb: define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_sube_commuted: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sube_commuted: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, 
v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -306,33 +306,33 @@ bb: define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sube_sub: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sube_sub: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -350,33 +350,33 @@ bb: define amdgpu_kernel void @zext_flclass(ptr 
addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: zext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: zext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -392,33 +392,33 @@ bb: define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) { ; GCN-LABEL: sext_flclass: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 
-; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v3, 0x260 -; GCN-NEXT: v_cmp_class_f32_e32 vcc, s4, v3 +; GCN-NEXT: v_cmp_class_f32_e32 vcc, s0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sext_flclass: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -434,7 +434,7 @@ bb: define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: add_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: add_and: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_max_u32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 @@ -478,7 +478,7 @@ bb: define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_sext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -493,7 +493,7 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_sext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -518,7 +518,7 @@ bb: define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: cmp_sub_zext: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -533,7 +533,7 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) { ; ; GFX9-LABEL: cmp_sub_zext: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -557,33 +557,33 @@ bb: define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_addcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: 
s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_addcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -601,33 +601,33 @@ bb: define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) { ; GCN-LABEL: sub_subcarry: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v5, s0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_subb_u32_e32 v0, vcc, v4, v5, vcc -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_subcarry: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -646,7 +646,7 @@ bb: define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -665,7 +665,7 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -695,7 +695,7 @@ bb: define amdgpu_kernel void 
@sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 @@ -714,7 +714,7 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 5fbcd0bf669995..3a7100c5903ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,12 +5,12 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x2 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; CHECK-NEXT: s_and_b32 s4, s0, 0xffff ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll index 48bd8f9b80799b..c27e44609c527f 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) { ; GCN-LABEL: 
vectorLoadCombine: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) { ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 9e5dbe91504a0c..e9dbce9026ca04 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -48,40 +48,41 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -103,7 +104,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x3: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -128,7 +129,7 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa ; ; VI-LABEL: test_copy_v4i8_x3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -162,51 +163,51 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s18, s6 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s19, s7 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_mov_b32 s23, s7 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 -; SI-NEXT: s_mov_b32 s16, s12 -; SI-NEXT: s_mov_b32 s17, s13 -; SI-NEXT: 
s_mov_b32 s20, s14 -; SI-NEXT: s_mov_b32 s21, s15 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 @@ -240,22 +241,23 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -271,23 +273,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -324,7 +326,7 @@ define amdgpu_kernel void 
@test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -363,7 +365,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 @@ -411,7 +413,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -431,7 +433,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +457,7 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -475,7 +477,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -500,7 +502,7 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v3i8_align1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -523,7 +525,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: test_copy_v3i8_align1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -550,7 +552,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -567,7 +569,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p ; ; VI-LABEL: test_copy_v4i8_volatile_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -589,7 +591,7 @@ define amdgpu_kernel 
void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_volatile_store: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -616,7 +618,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ; ; VI-LABEL: test_copy_v4i8_volatile_store: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll index 95d28c9749522d..a0e76f9a47a8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -6,44 +6,44 @@ define protected amdgpu_kernel void @sccClobber(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %e, ptr addrspace(1) %f, ptr addrspace(1) %pout.coerce) { ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry -; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 +; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_min_i32 s4, s16, 0 -; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; RRLIST-NEXT: s_and_b64 s[2:3], vcc, exec -; RRLIST-NEXT: s_cselect_b32 s2, s16, s17 -; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; RRLIST-NEXT: s_cselect_b32 s0, s4, s2 +; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec +; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 +; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] +; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm ; ; FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry -; FAST-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 -; FAST-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; FAST-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x44 +; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_min_i32 s4, s16, 0 -; FAST-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; FAST-NEXT: s_and_b64 s[2:3], vcc, exec -; FAST-NEXT: s_cselect_b32 s2, s16, s17 -; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[0:1] -; FAST-NEXT: s_cselect_b32 s0, s4, s2 +; FAST-NEXT: s_and_b64 s[0:1], vcc, exec +; FAST-NEXT: s_cselect_b32 s0, s16, s17 +; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] +; FAST-NEXT: s_cselect_b32 s0, s4, s0 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index c57ee9cc6a1e2d..7dd95a02f136b7 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) { ; GCN-LABEL: copy_to_scc: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252 ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 63b9d68123fa41..4decf39d040134 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -23,11 +23,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -36,8 +36,8 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -62,36 +62,36 @@ define 
amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_ctlz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s4 +; GFX11-NEXT: s_clz_i32_u32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: 
s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -107,7 +107,7 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -164,7 +164,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -190,14 +190,13 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -214,7 +213,7 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -236,7 +235,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -278,7 +277,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -293,7 +292,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -308,11 +307,9 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -336,7 +333,7 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -362,7 +359,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -414,7 +411,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -433,7 +430,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -452,11 +449,9 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -485,7 +480,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -505,7 +500,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -555,7 +550,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -568,7 +563,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: 
v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -581,7 +576,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] @@ -603,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -617,8 +612,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -650,11 +645,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_ctlz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -663,12 +658,12 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_ctlz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -678,14 +673,14 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-LABEL: s_ctlz_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] +; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s0, s0, 64 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] +; GFX11-NEXT: s_min_u32 s2, s2, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ 
-697,7 +692,7 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -711,7 +706,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -742,7 +737,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -753,7 +748,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -764,7 +759,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -783,7 +778,7 @@ define amdgpu_kernel 
void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -804,7 +799,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -852,7 +847,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -867,7 +862,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -883,9 +878,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, 
s[2:3] @@ -912,7 +905,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -933,7 +926,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -981,7 +974,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -996,7 +989,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1012,20 +1005,18 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp -; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1042,7 +1033,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1061,7 +1052,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1099,7 +1090,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1111,7 +1102,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1126,10 +1117,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1151,7 +1140,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1170,7 +1159,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1208,7 +1197,7 @@ define amdgpu_kernel void 
@v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1220,7 +1209,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1235,10 +1224,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1261,7 +1248,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1283,7 +1270,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1326,7 +1313,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,7 +1328,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1356,16 +1343,14 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ 
-1385,7 +1370,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1407,7 +1392,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1450,7 +1435,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1450,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1480,16 +1465,14 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1509,7 +1492,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1527,7 +1510,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1569,7 +1552,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1580,7 +1563,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; 
GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1600,8 +1583,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1623,7 +1606,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1641,7 +1624,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1691,7 +1674,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1706,7 +1689,7 @@ define amdgpu_kernel void 
@v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1722,7 +1705,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1750,7 +1733,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1769,7 +1752,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1812,7 +1795,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1824,7 +1807,7 @@ define amdgpu_kernel void 
@v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1846,14 +1829,13 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index f16f05811c185a..d269eb680138bb 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -29,11 +29,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 @@ -41,10 +41,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -64,13 +64,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, ptr addrspace(1) %out, align 4 @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; 
VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -134,7 +134,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -232,7 +232,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out define 
amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -318,11 +318,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 24 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, 
s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 24 +; VI-NEXT: s_lshl_b32 s2, s2, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -373,14 +373,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 24 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i8 %val, 0 @@ -392,11 +392,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 16 +; SI-NEXT: s_lshl_b32 s2, s2, 16 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ 
-405,10 +405,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -447,14 +447,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i16 %val, 0 @@ -466,11 +466,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -478,10 +478,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s4 +; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -501,13 +501,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i32 %val, 0 @@ -519,7 +519,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -533,7 +533,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -561,7 +561,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -580,7 +580,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -649,7 +649,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -672,7 +672,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -753,7 +753,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -778,7 +778,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -809,7 +809,7 @@ define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -869,7 +869,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -899,7 +899,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -946,7 +946,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1050,7 +1050,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte 
v0, v1, s[2:3] @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1183,8 +1183,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,14 +1196,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; 
VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1225,14 +1225,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1243,7 +1243,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void 
@s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,7 +1364,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -1388,7 +1388,7 @@ define 
amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1454,7 +1454,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,7 +1534,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr 
addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1726,7 +1726,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1750,7 +1750,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ 
-1799,7 +1799,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1850,7 +1850,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,7 +1972,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1997,7 +1997,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2018,7 +2018,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2057,7 +2057,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: 
v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2082,7 +2082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2103,7 +2103,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2142,7 +2142,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -2196,11 +2196,11 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s2, s4, 14 +; SI-NEXT: s_lshl_b32 s2, s2, 14 ; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2213,10 +2213,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_ctlz_zero_undef_i18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 14 +; VI-NEXT: s_lshl_b32 s2, s2, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2270,18 +2270,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshl_b32 s2, s4, 14 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 -; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff -; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff +; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] offset:2 ; 
GFX9-GISEL-NEXT: s_endpgm %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone store i18 %ctlz, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 40929d58834472..b6359f18169799 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) ; ; VI-LABEL: s_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v1, s3 @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { ; SI-LABEL: ctpop_i16_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: s_cmp_lg_u32 s5, 0 @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: s_cmp_lg_u32 s5, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 1c16612bed37fc..131ce14a7847c8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind 
readnone define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind { ; SI-LABEL: v_ctpop_i64_user: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd 
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s3 @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctpop_i64_user: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 ; ; VI-LABEL: s_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64 define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind { ; SI-LABEL: s_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: 
s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5] -; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7] -; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9] -; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: 
s_endpgm %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64 define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword 
s0, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s8, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s8, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 @@ -409,8 +409,8 @@ endif: define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind { ; SI-LABEL: s_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val ; ; VI-LABEL: s_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_kernel void 
@s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind { ; SI-LABEL: s_ctpop_i65: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) ; ; VI-LABEL: s_ctpop_i65: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_i128: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 02b0b1cc28fa86..ee2894a66fbfcc 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -22,11 +22,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, 
i32 %val) nounwind { ; SI-LABEL: s_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s2, s4 +; SI-NEXT: s_ff1_i32_b32 s2, s2 ; SI-NEXT: s_min_u32 s4, s2, 32 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; ; VI-LABEL: s_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -61,27 +61,27 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n ; GFX10-LABEL: s_cttz_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-NEXT: s_min_u32 s2, s2, 32 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-NEXT: s_min_u32 s0, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 32 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -91,7 +91,7 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -148,7 +148,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -204,7 +204,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -310,7 +310,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -381,7 +381,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -427,7 +427,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 
s10, s6 @@ -475,7 +475,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -487,7 +487,7 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -505,8 +505,8 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { ; SI-LABEL: s_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; ; VI-LABEL: s_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -552,11 +552,11 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-LABEL: s_cttz_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 
0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-NEXT: s_min_u32 s0, s0, 64 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] @@ -565,12 +565,12 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-LABEL: s_cttz_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] -; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -584,7 +584,7 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -598,7 +598,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,7 +629,7 @@ define amdgpu_kernel void 
@s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -640,7 +640,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -657,7 +657,7 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -726,7 +726,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; 
GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -787,7 +787,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -850,7 +850,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -876,7 +876,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) 
noalias %out, ptr a define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -895,7 +895,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -945,7 +945,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -970,7 +970,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 @@ -989,7 +989,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1065,7 +1065,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1087,7 +1087,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1130,7 +1130,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1235,7 +1235,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1275,7 +1275,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1293,7 +1293,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 
v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1375,7 +1375,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1393,7 +1393,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1456,7 +1456,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -1554,7 +1554,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 2491abe4bc1cee..392a44318b0a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -16,11 +16,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 
s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -28,10 +28,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: s_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -51,13 +51,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, ptr addrspace(1) %out, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -86,7 +86,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -141,7 +141,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -219,7 +219,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -305,11 +305,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, 
s[0:3], 0 @@ -317,10 +317,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: s_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -356,13 +356,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i8 %val, 0 @@ -374,11 +374,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 
s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,10 +386,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -425,13 +425,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i16 %val, 0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s4, s4 +; SI-NEXT: s_ff1_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -455,10 +455,10 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s2, s4 +; VI-NEXT: s_ff1_i32_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -478,13 +478,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i32 %val, 0 @@ -496,7 +496,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] @@ -538,7 +538,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -557,7 +557,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -622,7 +622,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -668,7 +668,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -745,7 +745,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -776,7 +776,7 @@ define amdgpu_kernel void 
@v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -866,7 +866,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -913,7 +913,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte 
v0, v1, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1091,7 +1091,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1274,7 +1274,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1435,7 +1435,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; 
SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1453,7 +1453,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,7 +1498,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1544,7 +1544,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1597,7 +1597,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 96969a12b2c589..3f513e120e141b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -900,7 +900,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -918,7 +918,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -955,8 +955,8 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -976,7 +976,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -996,7 +996,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,11 +1039,9 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1064,7 +1062,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) 
noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1086,7 +1084,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1104,7 +1102,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,10 +1130,8 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1157,7 +1153,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; 
SI-NEXT: s_mov_b32 s11, s7 @@ -1179,7 +1175,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1198,7 +1194,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,11 +1224,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1259,7 +1253,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1287,7 +1281,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,7 +1312,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1361,10 +1355,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1396,7 +1388,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -1434,7 +1426,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; VI-NEXT: s_mov_b32 s8, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1481,7 +1473,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1529,11 +1521,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 @@ -1573,21 +1563,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1596,7 +1586,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -1609,22 +1599,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,12 +1643,12 @@ define amdgpu_kernel void 
@load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1714,13 +1704,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1767,7 +1755,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1806,7 +1794,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1851,7 +1839,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1910,11 +1898,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 @@ -1953,7 +1939,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -1980,7 +1966,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2004,7 +1990,7 @@ define amdgpu_kernel void 
@load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2044,11 +2030,9 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2077,7 +2061,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2097,7 +2081,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2114,7 +2098,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,14 +2124,13 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -2166,7 +2149,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2185,7 +2168,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2201,7 +2184,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2225,10 +2208,8 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2252,7 +2233,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s10, 0 @@ -2270,7 +2251,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2285,7 +2266,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] @@ -2307,8 +2288,8 @@ define amdgpu_kernel void 
@i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2329,7 +2310,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2357,7 +2338,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2388,7 +2369,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2431,10 +2412,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2465,7 +2444,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2484,7 +2463,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2500,7 +2479,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2524,10 +2503,8 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt 
vmcnt(0) @@ -2548,7 +2525,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2567,7 +2544,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2583,7 +2560,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,10 +2584,8 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2632,7 +2607,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) 
nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -2651,7 +2626,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2667,7 +2642,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2691,10 +2666,8 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2716,7 +2689,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; 
SI-NEXT: s_mov_b32 s11, s7 @@ -2735,7 +2708,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2751,7 +2724,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2775,10 +2748,8 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2800,7 +2771,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -2820,7 +2791,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -2840,7 +2811,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX10-LABEL: cvt_ubyte0_or_multiuse: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2868,17 +2839,15 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 6799980c184391..fed4b9862dbfb4 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -8,7 +8,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -30,7 +30,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -52,7 +52,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,7 +96,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -118,7 +118,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) 
{ ; CHECK-LABEL: nand: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -154,7 +154,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: max: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -198,7 +198,7 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -220,7 +220,7 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: min: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) @@ -242,7 +242,7 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -264,7 +264,7 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -286,7 +286,7 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -308,7 +308,7 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -330,7 +330,7 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: cmpxchg: ; CHECK: ; 
%bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 @@ -354,7 +354,7 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace( define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: xchg: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -376,7 +376,7 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -398,7 +398,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -420,7 +420,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) % define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -457,7 +457,7 @@ define protected amdgpu_kernel void @fadd(ptr 
addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fsub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -494,7 +494,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -519,7 +519,7 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) { ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 @@ -544,15 +544,15 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.swap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc 
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -566,15 +566,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -588,15 +588,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: 
buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -610,15 +610,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -632,15 +632,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; 
CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -654,15 +654,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -676,15 +676,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -698,15 +698,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -720,15 +720,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: 
v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -742,15 +742,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -764,15 +764,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.inc: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -786,15 +786,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.dec: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -808,16 +808,16 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.cmpswap: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -831,17 +831,16 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fadd: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v1, 1.0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -856,18 +855,17 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -882,18 +880,17 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll 
b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 297fe7618672e6..1e5ec361d154c5 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp0_b32 s0, 0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x48 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 ; CHECK-NEXT: v_mov_b32_e32 v8, s10 ; CHECK-NEXT: v_mov_b32_e32 v9, s11 ; CHECK-NEXT: v_mov_b32_e32 v10, s12 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll index 8fa0068a237cd5..f414565f78f11a 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @eq_t(float %x) { ; GCN-LABEL: eq_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -21,7 +21,7 @@ define amdgpu_kernel void @eq_t(float %x) { define amdgpu_kernel void @ne_t(float %x) { ; GCN-LABEL: ne_t: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -38,7 +38,7 @@ define amdgpu_kernel void @ne_t(float %x) { define amdgpu_kernel void @eq_f(float %x) { ; GCN-LABEL: eq_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @eq_f(float %x) { define amdgpu_kernel void @ne_f(float %x) { ; GCN-LABEL: ne_f: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index f298a95c63485e..8f31bb1fe0a81c 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -7,11 +7,11 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_0_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_lshl_b32 s4, s2, 16 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -19,33 +19,33 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_0_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 
0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_0_i16: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_lshl_b32 s2, s4, 16 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_lshl_b32 s0, s4, 16 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_0_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -92,11 +92,11 @@ define i32 @divergent_vec_0_i16(i16 %a) { define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; GCN-LABEL: uniform_vec_i16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 
s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -104,33 +104,33 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) { ; ; GFX9-LABEL: uniform_vec_i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -177,11 +177,11 @@ define i32 @divergent_vec_i16_0(i16 %a) { 
define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; GCN-LABEL: uniform_vec_f16_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s2, 0xffff ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -189,33 +189,33 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) { ; ; GFX9-LABEL: uniform_vec_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_f16_0: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -262,7 +262,7 @@ define float @divergent_vec_f16_0(half %a) { define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_i16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -277,7 +277,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_i16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_i16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_i16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -361,7 +361,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) { define 
amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_LH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 @@ -376,7 +376,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -386,7 +386,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -452,7 +452,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: uniform_vec_i16_HH: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -466,7 +466,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -476,7 +476,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 @@ -486,7 +486,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,7 +546,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) { ; GCN-LABEL: uniform_vec_f16_LL: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 ; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -561,7 +561,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX9-LABEL: uniform_vec_f16_LL: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX906-LABEL: uniform_vec_f16_LL: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX906-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa ; ; GFX11-LABEL: uniform_vec_f16_LL: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0 @@ -684,10 +684,10 @@ entry: define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 { ; GCN-LABEL: build_vec_v2i16_undeflo_uniform: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -698,35 +698,35 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ; ; GFX9-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read_u16_d16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 +; 
GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, s4 ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: build_vec_v2i16_undeflo_uniform: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll index d99e9699c27894..8c3155fc5c6ea8 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 
-1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -48,7 +48,7 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_i32 s2, s2, s3 @@ -69,7 +69,7 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index 5b39cc2e185b7d..243c741c17088f 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i16_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from 
%ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -58,9 +58,9 @@ define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i32_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.x.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 @@ -110,9 +110,9 @@ define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) { define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) { ; GCN-LABEL: name: uniform_trunc_i64_to_i1 ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 diff --git 
a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll index 75d9dd924a4d60..b0e1da3b8eecba 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds1align1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u8 v0, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds2align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds2align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds2align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_u16 v0, v0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) 
%out) { ; GCN-LABEL: ds2align2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_u16 v0, v0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds4align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 @@ 
-160,7 +160,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds4align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -174,7 +174,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds4align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b32 v0, v0 @@ -190,7 +190,7 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds4align4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b32 v0, v0 @@ -206,7 +206,7 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -291,7 +291,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds8align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 @@ -311,7 +311,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; ALIGNED-GISEL-LABEL: ds8align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -331,7 +331,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out ; ; UNALIGNED-LABEL: ds8align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 @@ -347,7 +347,7 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align4: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -363,7 +363,7 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds8align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 @@ -379,7 +379,7 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -473,7 +473,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -489,7 +489,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou 
define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 @@ -513,7 +513,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -539,7 +539,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds12align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds12align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -569,7 +569,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -583,7 +583,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds12align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -599,7 +599,7 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds12align8: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 @@ -613,7 +613,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds12align8: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -627,7 +627,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds12align8: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 @@ -641,7 +641,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: 
ds12align8: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 @@ -657,7 +657,7 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds12align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b96 v[0:2], v0 @@ -673,7 +673,7 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align1: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 @@ -716,7 +716,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align1: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align1: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 
v[0:3], v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-SDAG-LABEL: ds16align2: ; ALIGNED-SDAG: ; %bb.0: -; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 @@ -835,7 +835,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; ALIGNED-GISEL-LABEL: ds16align2: ; ALIGNED-GISEL: ; %bb.0: -; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 @@ -867,7 +867,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-LABEL: ds16align2: ; UNALIGNED: ; %bb.0: -; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 @@ -883,7 +883,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; ALIGNED-LABEL: ds16align4: ; ALIGNED: ; %bb.0: -; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 @@ -897,7 +897,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-SDAG-LABEL: ds16align4: ; UNALIGNED-SDAG: ; %bb.0: -; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou ; ; UNALIGNED-GISEL-LABEL: ds16align4: ; UNALIGNED-GISEL: ; %bb.0: -; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -927,7 +927,7 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -943,7 +943,7 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) { ; GCN-LABEL: ds16align16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b128 v[0:3], v0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll index 31bbe6fbbaa143..aa1d44c31606b8 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: ds_read32_combine_stride_400: -; GCN: 
s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -47,7 +47,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_20: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -91,7 +91,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -136,7 +136,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 @@ -172,7 +172,7 @@ bb: } ; GCN-LABEL: ds_read32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -206,7 +206,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -246,7 +246,7 @@ bb: } ; GCN-LABEL: ds_read64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] @@ -280,7 
+280,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -316,7 +316,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_400_back: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -352,7 +352,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 @@ -379,7 +379,7 @@ bb: } ; GCN-LABEL: ds_write32_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]] @@ -406,7 +406,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_400: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]] @@ -437,7 +437,7 @@ bb: } ; GCN-LABEL: ds_write64_combine_stride_8192_shifted: -; GCN: s_load_dword [[ARG:s[0-9]+]], s[6:7], 0x0 +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] ; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]] diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll index 7d75f1947b51af..5814b8a8ceda45 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -9,7 +9,7 @@ ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 ; GCN: s_waitcnt lgkmcnt({{[0-9]+}}) -define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -37,7 +37,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 -define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -67,7 +67,7 @@ define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -96,7 +96,7 @@ define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108 ; GCN-NEXT: 
ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 -define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 { +define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) { %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100 %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4 @@ -115,5 +115,3 @@ define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspa store float %sum, ptr addrspace(1) %out, align 4 ret void } - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 41a9d7999e80a3..7b9b130e1cf796 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -36,9 +36,8 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { ; ; GFX11-LABEL: write_ds_sub0_offset0_global: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:12 ; GFX11-NEXT: s_endpgm @@ -54,7 +53,7 @@ entry: define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword 
s0, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -74,7 +73,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0 @@ -91,7 +90,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b @@ -107,11 +106,10 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v ; ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -137,7 +135,7 @@ entry: define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 { ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 ; CI-NEXT: s_mov_b64 vcc, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -156,7 +154,7 @@ define amdgpu_kernel void 
@write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -172,7 +170,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b @@ -187,7 +185,7 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy ; ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0 @@ -235,9 +233,7 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 ; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -279,9 +275,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -324,9 +319,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -367,9 +361,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 ; GFX11-NEXT: ds_store_b8 v0, v1 ; GFX11-NEXT: s_endpgm @@ -414,8 +407,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 @@ -463,9 +455,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset: ; 
GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 @@ -512,10 +503,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm @@ -531,7 +521,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 { ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 @@ -552,7 +542,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0 @@ -570,7 +560,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 @@ -588,12 +578,11 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_ ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -648,10 +637,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() # ; ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll 
b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index b72cd7e1d1eca4..777a8f3fef1c17 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -51,7 +51,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -64,7 +64,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -88,7 +88,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:1028 -; CI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,7 +102,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:1028 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -126,7 +126,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -142,7 +142,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -184,7 +184,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-NEXT: s_mov_b32 s2, 0 @@ -202,7 +202,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -245,7 +245,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset0:2 offset1:8 ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:11 offset1:27 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -261,7 +261,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v4 offset0:2 offset1:8 ; GFX9-NEXT: ds_read2_b32 v[2:3], v4 offset0:11 offset1:27 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v3 @@ -301,7 +301,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -319,7 +319,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -352,7 +352,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -406,7 +406,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[1:2], v0 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -419,7 +419,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -449,7 +449,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 
; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -487,7 +487,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b32 v1, v0 ; CI-NEXT: ds_read_b32 v2, v0 offset:32 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +501,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ds_read_b32 v1, v0 ; GFX9-NEXT: ds_read_b32 v2, v0 offset:32 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -522,11 +522,13 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:34 ; CI-NEXT: ds_read_u8 v3, 
v1 offset:32 ; CI-NEXT: ds_read_u8 v4, v1 offset:3 @@ -535,13 +537,15 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: ds_read_u8 v7, v1 ; CI-NEXT: ds_read_u8 v8, v1 offset:33 ; CI-NEXT: ds_read_u8 v1, v1 offset:35 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -550,7 +554,6 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -558,8 +561,8 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -582,17 +585,17 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] 
+; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -612,11 +615,13 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u8 v2, v1 offset:11 ; CI-NEXT: ds_read_u8 v3, v1 offset:9 ; CI-NEXT: ds_read_u8 v4, v1 offset:8 @@ -625,13 +630,15 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_read_u8 v7, v1 offset:5 ; CI-NEXT: ds_read_u8 v8, v1 offset:10 ; CI-NEXT: ds_read_u8 v1, v1 offset:12 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_waitcnt lgkmcnt(5) ; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: s_waitcnt lgkmcnt(3) ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 -; CI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -640,7 +647,6 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; CI-NEXT: v_or_b32_e32 v4, v4, v6 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 ; CI-NEXT: v_add_f32_e32 v2, v4, v1 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -648,8 +654,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 @@ -672,17 +678,17 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 
; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -702,41 +708,44 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_2_simple_read2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 ; CI-NEXT: ds_read_u16 v2, v1 offset:32 ; CI-NEXT: ds_read_u16 v3, v1 offset:2 ; CI-NEXT: ds_read_u16 v4, v1 ; CI-NEXT: ds_read_u16 v1, v1 offset:34 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_or_b32_e32 v3, v3, v4 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_add_f32_e32 v2, v3, v1 -; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0 +; GFX9-ALIGNED-NEXT: 
v_add_u32_e32 v1, s2, v0 ; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] @@ -744,12 +753,12 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ; ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1] @@ -772,7 +781,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -785,7 +794,7 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:8 -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -808,7 +817,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0 @@ -821,7 +830,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read2_b64 v[0:3], v4 offset1:255 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -845,7 +854,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[1:2], v0 ; CI-NEXT: ds_read_b64 v[3:4], v0 offset:2056 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,7 +868,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:2056 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -880,15 +889,15 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr 
addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_read2_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0x2 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v3, vcc, s0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 ; CI-NEXT: ds_read2_b32 v[1:2], v3 offset1:1 ; CI-NEXT: ds_read2_b32 v[3:4], v3 offset0:14 offset1:15 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f64 v[2:3], v[1:2], v[3:4] @@ -898,13 +907,13 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: misaligned_read2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:14 offset1:15 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -929,7 +938,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -941,7 +950,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr 
addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -959,7 +968,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -971,7 +980,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -991,7 +1000,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b128 v[0:3], v0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,7 +1013,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1026,7 +1035,7 @@ define amdgpu_kernel void 
@load_misaligned64_constant_large_offsets(ptr addrspac ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384 ; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1040,7 +1049,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384 ; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -1059,11 +1068,12 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; CI-LABEL: sgemm_inner_loop_read2_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_lshl_b32 s4, s6, 2 -; CI-NEXT: s_add_i32 s5, s4, 0xc20 -; CI-NEXT: s_addk_i32 s4, 0xc60 -; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_lshl_b32 s0, s2, 2 +; CI-NEXT: s_add_i32 s1, s0, 0xc20 +; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 @@ -1071,29 +1081,24 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; CI-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_waitcnt lgkmcnt(4) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v1 -; CI-NEXT: s_waitcnt lgkmcnt(3) ; 
CI-NEXT: v_add_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, v0, v3 -; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_add_f32_e32 v0, v0, v4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v5 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 ; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1104,12 +1109,16 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, ; GFX9-NEXT: ds_read2_b32 v[4:5], v8 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 ; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1163,28 +1172,28 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_v2i32: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: misaligned_read2_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, ptr addrspace(3) %in, align 4 store <2 x i32> %load, ptr addrspace(1) %out, align 8 @@ -1194,28 +1203,28 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; CI-LABEL: misaligned_read2_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s4, s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; 
GFX9-LABEL: misaligned_read2_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load i64, ptr addrspace(3) %in, align 4 store i64 %load, ptr addrspace(1) %out, align 8 @@ -1225,8 +1234,8 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-LABEL: ds_read_diff_base_interleaving: ; CI: ; %bb.0: ; %bb -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -1256,10 +1265,10 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; ; GFX9-LABEL: ds_read_diff_base_interleaving: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s4, v1 @@ -1461,7 +1470,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; CI-NEXT: ds_read_u8 v6, v0 offset:66 ; CI-NEXT: ds_read_u8 v0, v0 offset:65 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x0 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 @@ -1488,7 +1497,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(7) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v7 @@ -1505,7 +1514,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 9f191fa69f6549..06908d21e53556 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -23,7 +23,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -44,7 +44,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -60,7 +60,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -85,7 +85,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -131,7 +131,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr 
addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_volatile_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_f32_volatile_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -182,7 +182,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -199,7 +199,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace ; ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: ; kill: killed $vgpr4 @@ -229,7 +229,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg2_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: 
s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -244,7 +244,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -268,7 +268,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_subreg4_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 @@ -283,7 +283,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_subreg4_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,7 +307,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_max_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -323,7 +323,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; ; GFX9-LABEL: simple_write2_two_val_max_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -348,7 +348,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_too_far_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: simple_write2_two_val_too_far_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -413,7 +413,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX9-LABEL: simple_write2_two_val_f32_x2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -450,7 +450,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr 
define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -469,7 +469,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -506,21 +506,21 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x6 -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x6 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[4:5] +; CI-NEXT: s_mov_b64 s[0:1], s[4:5] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] -; CI-NEXT: s_mov_b64 s[2:3], s[10:11] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v1, s12 +; CI-NEXT: s_mov_b64 s[4:5], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: 
buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_mov_b32_e32 v1, s8 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v3, s9 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: ds_write_b32 v1, v2 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -529,14 +529,14 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; ; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_write_b32 v0, v1 offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -566,7 +566,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -580,7 +580,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -601,15 +601,15 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad 
define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: misaligned_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 @@ -618,11 +618,11 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; ; GFX9-LABEL: misaligned_simple_write2_one_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 @@ -642,15 +642,15 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 { ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x4 -; CI-NEXT: 
s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; CI-NEXT: s_load_dword s0, s[0:1], 0x4 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -675,11 +675,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 @@ -702,11 +702,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-UNALIGNED-NEXT: 
global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5 @@ -726,7 +726,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_two_val_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX9-LABEL: simple_write2_two_val_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc @@ -868,11 +868,11 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 { ; CI-LABEL: write2_sgemm_sequence: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 -; CI-NEXT: s_lshl_b32 s1, s6, 2 +; CI-NEXT: s_lshl_b32 s1, s2, 2 ; CI-NEXT: s_add_i32 s2, s1, 0xc20 ; CI-NEXT: s_addk_i32 s1, 0xc60 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -890,8 +890,8 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20 @@ -945,12 +945,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: simple_write2_v4f32_superreg_align4: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2 -; CI-NEXT: s_load_dword s4, s[2:3], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; CI-NEXT: s_load_dword s4, s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1 @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8 -; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0 ; 
GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4 -; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/early-inline.ll b/llvm/test/CodeGen/AMDGPU/early-inline.ll index 02ab2a065c0ef5..c1a049cf055cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/early-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/early-inline.ll @@ -25,7 +25,6 @@ entry: ; CHECK-LABEL: @alias_caller( ; CHECK-NOT: call -; CHECK: {{^[}]}} define amdgpu_kernel void @alias_caller(i32 %x) { entry: %res = call i32 @c_alias(i32 %x) diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll index 554cb140f42923..d958dde01c3f85 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll @@ -80,11 +80,9 @@ ; R600-NOT: .amd_amdgpu_hsa_metadata ; R600-NOT: .amd_amdgpu_pal_metadata -define amdgpu_kernel void @elf_notes() #0 { +define amdgpu_kernel void @elf_notes() { ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 32b9f9cb97095f..86ec6269b1c9bc 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ 
-6,10 +6,10 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: s_load_dword s14, s[6:7], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s13 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 +; CHECK-NEXT: s_add_u32 s24, s24, s7 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 @@ -24,7 +24,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s14, 8 +; CHECK-NEXT: s_bitcmp1_b32 s6, 8 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] ; CHECK-NEXT: s_cselect_b64 s[14:15], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index 54fb1dc5c05274..c744ace37a8315 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -93,7 +93,7 @@ bb: define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb -; GFX7-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX9-LABEL: s_add_co_br_user: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_add_i32 s0, s2, s2 ; GFX9-NEXT: s_cmp_lt_u32 s0, s2 @@ -146,7 +146,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX10-LABEL: s_add_co_br_user: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_i32 s1, s0, s0 ; GFX10-NEXT: s_cmp_lt_u32 s1, s0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; ; GFX11-LABEL: s_add_co_br_user: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index ee1df9aa0d6cea..db3ea4df52981c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -180,8 +180,8 @@ entry: } ; GCN-LABEL: {{^}}float8_extelt: -; GCN-DAG: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[2:3], 0x2c +; GCN-DAG: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[0:1], 0x2c ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 @@ -411,10 +411,10 @@ entry: ; GCN-LABEL: {{^}}bit4_extelt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, 3 +; GCN-NEXT: s_lshl_b32 s2, s2, 3 ; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll 
b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index f4ec16db55d68a..70011e56d016e0 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: extract_vector_elt_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -36,7 +36,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: extract_vector_elt_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -62,8 +62,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s0, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s1, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -77,8 +77,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; ; VI-LABEL: 
extract_vector_elt_v2f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,8 +95,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 @@ -119,8 +119,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -139,15 +139,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; 
VI-NEXT: flat_load_dword v2, v[1:2] -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -162,14 +162,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 ; ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -195,7 +193,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,7 +208,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -224,7 +222,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -249,8 +247,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b32 s4, s4, 4 @@ -264,8 +262,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: dynamic_extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -280,8 +278,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s4, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -302,7 +300,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_extractelement_v4f16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -319,7 +317,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_extractelement_v4f16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,9 +336,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_extractelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -363,7 +359,7 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -384,7 +380,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,21 +403,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v2, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -439,7 +434,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_01: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -456,7 +451,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_01: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -473,7 +468,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +495,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { ; SI-LABEL: reduce_load_vector_v8f16_extract_23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s0, s[0:1], 0x1 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -517,7 +512,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; VI-LABEL: reduce_load_vector_v8f16_extract_23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -534,7 +529,7 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) ; ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -561,8 +556,8 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 @@ -613,8 +608,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: 
v_extractelement_v8f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -656,46 +651,43 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] +; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -712,8 +704,8 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 @@ -802,8 +794,8 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr 
addrspace(1) ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -877,81 +869,78 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; 
GFX11-NEXT: s_cmp_eq_u32 s0, 4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index d670d69947361c..b69852da247445 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -81,7 +81,7 @@ define amdgpu_kernel void 
@extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[2:3], 0x24 +; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[0:1], 0x24 ; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] @@ -100,9 +100,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x ; SI: s_load_dwordx2 s ; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[2:3], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[2:3], 0x54 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x4c +; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 ; GCN-NOT: {{buffer|flat|global}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 164352ef75b3b9..331fe26160d412 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -133,8 +133,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; isTypeDesirableForOp in SimplifyDemandedBits ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -147,8 +147,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; 
VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c -; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c +; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28 ; VI-NOT: {{flat|buffer|global}} ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]] @@ -162,7 +162,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x30 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 @@ -179,7 +179,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x10 +; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 06da7eea0b47dc..d5464ce6aa8a33 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -8,8 +8,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -38,8 +38,8 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) 
%out define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) { ; GCN-LABEL: bitcast_fp_to_vector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -68,8 +68,8 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) { ; GCN-LABEL: bitcast_int_to_fpvector_extract_0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %o define amdgpu_kernel void @no_extract_volatile_load_extract0(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -122,7 +122,7 @@ entry: define amdgpu_kernel void @no_extract_volatile_load_extract2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: no_extract_volatile_load_extract2: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -146,8 +146,8 @@ entry: define amdgpu_kernel void 
@no_extract_volatile_load_dynextract(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) { ; GCN-LABEL: no_extract_volatile_load_dynextract: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s12, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 21799ab79b8396..f34824cd6cefe1 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: s_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,8 +23,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: s_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,8 +35,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: s_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -47,10 +47,10 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr 
addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -66,8 +66,8 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: s_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: s_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: s_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff @@ -102,10 +102,10 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: s_fabs_f16: ; GFX11: 
; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -120,8 +120,8 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -132,8 +132,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; VI-LABEL: s_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +144,8 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; ; GFX9-LABEL: s_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -156,10 +156,10 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GFX11-LABEL: s_fabs_v2f16: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -174,7 +174,7 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -187,7 +187,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -200,7 +200,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -212,7 +212,7 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff @@ -231,12 +231,12 @@ 
define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) { ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; VI-LABEL: fabs_fold_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -260,8 +260,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; ; GFX9-LABEL: fabs_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -273,13 +273,13 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-LABEL: fabs_fold_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2 +; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -293,7 +293,7 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -307,7 +307,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -332,9 +332,7 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fabs_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -356,8 +354,8 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) 
#0 { ; CI-LABEL: fabs_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -368,8 +366,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; VI-LABEL: fabs_free_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -380,8 +378,8 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fabs_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -392,10 +390,10 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fabs_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -413,7 +411,7 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) 
%out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fabs_fold_self_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -439,7 +437,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: v_fabs_fold_self_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -457,7 +455,7 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_fabs_fold_self_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -470,15 +468,14 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_fabs_fold_self_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -496,8 +493,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, 
i32 %other.val) #0 { ; CI-LABEL: v_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -524,8 +521,8 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -545,30 +542,28 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: 
s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 @@ -587,7 +582,7 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -610,7 +605,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; VI-LABEL: v_extract_fabs_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -629,7 +624,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fabs_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -645,9 +640,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fabs_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -680,7 +673,7 @@ define 
amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fabs_no_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,7 +691,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; VI-LABEL: v_extract_fabs_no_fold_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -716,7 +709,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -730,9 +723,7 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 60e19dcd48f1e6..07581ade57ccd5 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { ; 
SI-LABEL: s_fabsf_free: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_free: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %bc= bitcast i32 %in to float @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: s_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_bitset0_b32 s0, 31 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in) @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fabs_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset0_b32 s7, 31 -; SI-NEXT: 
s_bitset0_b32 s6, 31 -; SI-NEXT: s_bitset0_b32 s5, 31 -; SI-NEXT: s_bitset0_b32 s4, 31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_bitset0_b32 s2, 31 +; SI-NEXT: s_bitset0_b32 s1, 31 +; SI-NEXT: s_bitset0_b32 s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_bitset0_b32 s3, 31 @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) { ; SI-LABEL: fabs_fold: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: 
s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bitpreserve_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %in.bc = bitcast float %in to i32 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 7352fcdd071d5b..cdc6b5a48d0a69 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @fadd_f16( ; SI-LABEL: fadd_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -35,8 +35,8 @@ define amdgpu_kernel void @fadd_f16( ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; 
VI-NEXT: s_mov_b32 s2, s10 @@ -59,8 +59,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-SDAG-LABEL: fadd_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s11 @@ -87,8 +87,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-LABEL: fadd_f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -111,8 +111,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11 @@ -137,8 +137,8 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; 
GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -196,7 +196,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_a( ; SI-LABEL: fadd_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; VI-LABEL: fadd_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -234,7 +234,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -256,7 +256,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -294,7 +294,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: 
fadd_f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ entry: define amdgpu_kernel void @fadd_f16_imm_b( ; SI-LABEL: fadd_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +360,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; VI-LABEL: fadd_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -378,7 +378,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -400,7 +400,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -418,7 +418,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -438,7 +438,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void @fadd_v2f16( ; SI-LABEL: fadd_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -518,8 +518,8 @@ define amdgpu_kernel void @fadd_v2f16( ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -544,13 +544,11 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-SDAG-LABEL: fadd_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
s_clause 0x1 ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -567,10 +565,8 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-GISEL-LABEL: fadd_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 @@ -588,13 +584,11 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[2:3], 0x34 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] @@ -611,10 +605,8 @@ define amdgpu_kernel void @fadd_v2f16( ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1 @@ -665,7 +657,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_a( ; SI-LABEL: fadd_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -692,7 +684,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; VI-LABEL: fadd_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -713,12 +705,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 @@ -732,9 +722,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -749,12 
+737,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -768,9 +754,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -812,7 +796,7 @@ entry: define amdgpu_kernel void @fadd_v2f16_imm_b( ; SI-LABEL: fadd_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -839,7 +823,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; VI-LABEL: fadd_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -860,12 +844,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-SDAG-LABEL: 
fadd_v2f16_imm_b: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 @@ -879,9 +861,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] @@ -896,12 +876,10 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry -; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0 @@ -915,9 +893,7 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( ; ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b: ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry -; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
-; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 7252c69cb1cf75..4bfaa6e90bdfee 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -73,7 +73,7 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -89,7 +89,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -99,7 +99,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -108,7 +108,7 @@ define amdgpu_kernel void 
@global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -127,7 +127,7 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -219,7 +219,7 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -246,7 +246,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -256,7 +256,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -265,7 +265,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -274,7 +274,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -284,7 +284,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -358,7 +358,7 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 { define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 { ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: -; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 
0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -368,7 +368,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -378,7 +378,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX9-LABEL: global_store_2xi16_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -387,7 +387,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX10-LABEL: global_store_2xi16_align4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -396,7 +396,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX11-LABEL: global_store_2xi16_align4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -406,7 +406,7 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; ; GFX12-LABEL: global_store_2xi16_align4: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7af972b96ec68c..581b7b4cff9ed0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -21,7 +21,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_undef_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_undef_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( define amdgpu_kernel void 
@v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; CI-LABEL: v_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -119,10 +119,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v2, s4, s4 +; VI-NEXT: v_max_f16_e64 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -130,35 +130,34 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; ; GFX9-LABEL: s_test_canonicalize_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e64 v1, s4, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -169,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } +define half @s_test_canonicalize_arg(half %x) #1 { +; VI-LABEL: s_test_canonicalize_arg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_test_canonicalize_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: s_test_canonicalize_arg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_test_canonicalize_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call half @llvm.canonicalize.f16(half %x) + ret half %canonicalized +} + define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -211,7 +239,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -223,7 +251,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -234,7 +262,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -267,7 +295,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -279,7 +307,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -290,7 +318,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,7 +331,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -324,7 +352,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -336,7 +364,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -347,7 +375,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; CI-LABEL: v_test_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -360,7 +388,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -380,7 +408,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -392,7 +420,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -403,7 +431,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -416,7 +444,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -436,7 +464,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 { ; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -448,7 +476,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -459,7 +487,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -472,7 +500,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; ; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -493,7 +521,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -503,7 +531,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v0, v0, s[0:1] @@ -511,7 +539,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; 
CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -521,7 +549,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] @@ -536,7 +564,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -546,7 +574,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -555,7 +583,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -565,7 +593,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, 
s[0:1] @@ -580,7 +608,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -590,7 +618,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -599,7 +627,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_p1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -609,7 +637,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -624,7 +652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -634,7 +662,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -643,7 +671,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; CI-LABEL: test_fold_canonicalize_n1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -653,7 +681,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -668,7 +696,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -678,7 +706,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -687,7 +715,7 @@ 
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; CI-LABEL: test_fold_canonicalize_literal_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c00 @@ -697,7 +725,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -712,7 +740,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -722,7 +750,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +759,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 
0x3ff @@ -741,7 +769,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -756,7 +784,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -766,7 +794,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -775,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff @@ -785,7 +813,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -800,7 +828,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -810,7 +838,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -819,7 +847,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -829,7 +857,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; ; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -844,7 +872,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 { ; 
VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -854,7 +882,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -863,7 +891,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff @@ -873,7 +901,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -888,7 +916,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -898,7 +926,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) 
%out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -907,7 +935,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_qnan_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c00 @@ -917,7 +945,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -932,7 +960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -942,7 +970,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -951,7 +979,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; 
CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -961,7 +989,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -976,7 +1004,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -986,7 +1014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -995,7 +1023,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1005,7 +1033,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; 
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1020,7 +1048,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1030,7 +1058,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1039,7 +1067,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan0_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1049,7 +1077,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1064,7 +1092,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1074,7 +1102,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1083,7 +1111,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan1_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1093,7 +1121,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1108,7 +1136,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) 
; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1118,7 +1146,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,7 +1155,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan2_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1137,7 +1165,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1152,7 +1180,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1162,7 +1190,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00 
; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1171,7 +1199,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; CI-LABEL: test_fold_canonicalize_snan3_value_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 @@ -1181,7 +1209,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -1196,7 +1224,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1214,7 +1242,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,7 +1254,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; CI-LABEL: v_test_canonicalize_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1249,10 +1277,8 @@ 
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1272,7 +1298,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1290,7 +1316,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1303,7 +1329,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1326,14 +1352,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1351,7 +1376,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1369,7 +1394,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1382,7 +1407,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1406,14 +1431,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -1432,7 +1456,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1450,7 +1474,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1462,7 +1486,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; CI-LABEL: v_test_canonicalize_fneg_var_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: s_mov_b32 s7, s3 @@ -1486,10 +1510,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; ; 
GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1510,12 +1532,12 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; VI-LABEL: s_test_canonicalize_var_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_max_f16_e64 v0, s4, s4 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_max_f16_e64 v0, s2, s2 ; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1525,40 +1547,39 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_canonicalize_var_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; CI-LABEL: s_test_canonicalize_var_v2f16: ; CI: ; %bb.0: 
-; CI-NEXT: s_load_dword s0, s[2:3], 0xb +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_lshr_b32 s3, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_canonicalize_var_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1572,7 +1593,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1582,7 +1603,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -1590,7 +1611,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -1600,7 +1621,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1615,7 +1636,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x80008000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1625,7 +1646,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,7 +1655,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 
s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x80008000 @@ -1644,7 +1665,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1659,7 +1680,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_p1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1669,7 +1690,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1699,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_p1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 @@ -1688,7 +1709,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_p1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1703,7 +1724,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_n1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1713,7 +1734,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1722,7 +1743,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: test_fold_canonicalize_n1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 @@ -1732,7 +1753,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_n1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1747,7 +1768,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_literal_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1757,7 +1778,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX9-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,7 +1787,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; CI-LABEL: test_fold_canonicalize_literal_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x4c004c00 @@ -1776,7 +1797,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) ; ; GFX11-LABEL: test_fold_canonicalize_literal_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1791,7 +1812,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1801,7 +1822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1810,7 +1831,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1820,7 +1841,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1835,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1845,7 +1866,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,7 +1875,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; CI-LABEL: 
test_denormals_fold_canonicalize_denormal0_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x3ff03ff @@ -1864,7 +1885,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1879,7 +1900,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1889,7 +1910,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1898,7 +1919,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1908,7 +1929,7 @@ define amdgpu_kernel 
void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1923,7 +1944,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 { ; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1933,7 +1954,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1942,7 +1963,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x83ff83ff @@ -1952,7 +1973,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1967,7 +1988,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1977,7 +1998,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1986,7 +2007,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; CI-LABEL: test_fold_canonicalize_qnan_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7c007c00 @@ -1996,7 +2017,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o ; ; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2011,7 +2032,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2021,7 +2042,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2030,7 +2051,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2040,7 +2061,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2055,7 +2076,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2065,7 +2086,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2084,7 +2105,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2099,7 +2120,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2109,7 +2130,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2118,7 +2139,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; CI-LABEL: 
test_fold_canonicalize_snan0_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2128,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2143,7 +2164,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2153,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2162,7 +2183,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2172,7 +2193,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac 
; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2187,7 +2208,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2197,7 +2218,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2206,7 +2227,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2216,7 +2237,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2231,7 +2252,7 @@ define 
amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2241,7 +2262,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2250,7 +2271,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e007e00 @@ -2260,7 +2281,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2355,7 +2376,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2365,7 +2386,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -2373,7 +2394,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_mov_b32_e32 v0, 0 @@ -2383,7 +2404,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -2657,7 +2678,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 { ; VI-LABEL: s_test_canonicalize_undef_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2668,7 +2689,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX9-LABEL: s_test_canonicalize_undef_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2677,7 +2698,7 @@ define 
amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; CI-LABEL: s_test_canonicalize_undef_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -2688,7 +2709,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out ; ; GFX11-LABEL: s_test_canonicalize_undef_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index f0ce96af90649d..d53c0411ad88c1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -23,7 +23,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -35,7 +35,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -46,7 +46,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -59,7 +59,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -78,8 +78,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[6:7], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -89,8 +89,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX8-LABEL: s_test_canonicalize_var_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX9-LABEL: s_test_canonicalize_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -111,11 +111,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr 
addrspace(1) %out, fl ; GFX11-LABEL: s_test_canonicalize_var_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -123,7 +123,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -162,7 +162,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -175,7 +175,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -195,7 +195,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -207,7 +207,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -218,7 +218,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -231,7 +231,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -308,7 +308,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: -; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -318,7 +318,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX9-LABEL: test_fold_canonicalize_undef_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -326,7 +326,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX11-LABEL: test_fold_canonicalize_undef_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -336,7 +336,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; ; GFX12-LABEL: test_fold_canonicalize_undef_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -351,7 +351,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -361,7 +361,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -379,7 +379,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -413,7 +413,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -440,7 +440,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -469,7 +469,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -494,7 +494,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, -1.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -513,7 +513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -528,7 +528,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: -; 
GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -547,7 +547,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,7 +557,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -572,7 +572,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -582,7 +582,7 @@ define amdgpu_kernel void 
@test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] @@ -590,7 +590,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] @@ -615,7 +615,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -647,7 +647,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -663,7 +663,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -674,7 +674,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -695,7 +695,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -711,7 +711,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -722,7 +722,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s2, 0x7fffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_max_f32_e64 v1, s2, s2 @@ -732,7 +732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -743,7 +743,7 @@ define 
amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -769,7 +769,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 
v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -803,7 +803,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -822,7 +822,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -833,7 +833,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: 
test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -859,7 +859,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x807fffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -893,7 +893,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ 
-903,7 +903,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -912,7 +912,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -922,7 +922,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -937,7 +937,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -966,7 +966,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -981,7 +981,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -991,7 +991,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1142,7 +1142,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: 
test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1176,7 +1176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1186,7 +1186,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1201,7 +1201,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1213,7 +1213,7 @@ define amdgpu_kernel 
void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX9-LABEL: v_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1224,7 +1224,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX11-LABEL: v_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; ; GFX12-LABEL: v_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1276,7 +1276,7 @@ define amdgpu_kernel void 
@s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX9-LABEL: s_test_canonicalize_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX11-LABEL: s_test_canonicalize_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; ; GFX12-LABEL: s_test_canonicalize_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1324,7 +1324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1335,7 +1335,7 @@ define amdgpu_kernel void 
@v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1368,7 +1368,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1404,7 +1404,7 @@ define 
amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; ; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1437,7 +1437,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -1448,7 +1448,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; ; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -1481,7 +1481,7 @@ define 
amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1501,7 +1501,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1530,7 +1530,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1577,7 +1577,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1588,7 +1588,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_p1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1597,7 +1597,7 @@ define amdgpu_kernel 
void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_p1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1607,7 +1607,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_p1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1622,7 +1622,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: test_fold_canonicalize_n1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1642,7 +1642,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: test_fold_canonicalize_n1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_store_b64 v0, v[0:1], s[0:1] @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: test_fold_canonicalize_n1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1667,7 +1667,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1678,7 +1678,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX9-LABEL: test_fold_canonicalize_literal_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1687,7 +1687,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX11-LABEL: test_fold_canonicalize_literal_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; ; GFX12-LABEL: test_fold_canonicalize_literal_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v1, v0 @@ -1761,7 +1761,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr define amdgpu_kernel void 
@test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,7 +1772,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1840,7 +1840,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; ; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1856,7 +1856,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1888,7 +1888,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; ; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff ; GFX12-NEXT: v_mov_b32_e32 v0, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1904,7 +1904,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1915,7 +1915,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX9-LABEL: test_fold_canonicalize_qnan_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX11-LABEL: test_fold_canonicalize_qnan_f64: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1934,7 +1934,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; ; GFX12-LABEL: test_fold_canonicalize_qnan_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1949,7 +1949,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,7 +1960,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1979,7 +1979,7 @@ 
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2005,7 +2005,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2014,7 +2014,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; ; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2039,7 +2039,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2069,7 +2069,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2084,7 +2084,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( define amdgpu_kernel void 
@test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2114,7 +2114,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2140,7 +2140,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2149,7 +2149,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2159,7 +2159,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2174,7 +2174,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: -; GFX678-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,7 +2185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; ; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f64_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f64_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; 
GFX9-LABEL: test_canonicalize_value_f64_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2264,9 +2264,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f64_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2279,9 +2277,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f64_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2303,7 +2299,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f32_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2320,7 +2316,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f32_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2337,7 +2333,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f32_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2348,9 +2344,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f32_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2363,9 +2357,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f32_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2387,7 +2379,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2405,7 +2397,7 @@ define amdgpu_kernel 
void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX8-LABEL: test_canonicalize_value_f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2422,7 +2414,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX9-LABEL: test_canonicalize_value_f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2433,9 +2425,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX11-LABEL: test_canonicalize_value_f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2448,9 +2438,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; ; GFX12-LABEL: test_canonicalize_value_f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2473,7 +2461,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { ; GFX6-LABEL: test_canonicalize_value_v2f16_flush: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2496,7 +2484,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_flush: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2516,7 +2504,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_flush: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2527,9 +2515,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX11-LABEL: test_canonicalize_value_v2f16_flush: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2542,9 +2528,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_flush: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2566,7 +2550,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr 
addrspace(1) define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f64_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2583,7 +2567,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f64_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2600,7 +2584,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f64_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -2611,9 +2595,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f64_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2626,9 +2608,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f64_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -2650,7 +2630,7 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f32_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2667,7 +2647,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f32_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2684,7 +2664,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f32_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2695,9 +2675,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f32_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2710,9 +2688,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; ; 
GFX12-LABEL: test_canonicalize_value_f32_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2735,7 +2711,7 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2753,7 +2729,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX8-LABEL: test_canonicalize_value_f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2770,7 +2746,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX9-LABEL: test_canonicalize_value_f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -2781,9 +2757,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX11-LABEL: test_canonicalize_value_f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2796,9 +2770,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; ; GFX12-LABEL: test_canonicalize_value_f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2822,7 +2794,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { ; GFX6-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2845,7 +2817,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX8-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2864,7 +2836,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -2875,9 +2847,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX11-LABEL: 
test_canonicalize_value_v2f16_denorm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2890,9 +2860,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; ; GFX12-LABEL: test_canonicalize_value_v2f16_denorm: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] @@ -2914,7 +2882,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 { ; GFX6-LABEL: v_test_canonicalize_var_v2f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -2931,7 +2899,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX8-LABEL: v_test_canonicalize_var_v2f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2948,7 +2916,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX9-LABEL: v_test_canonicalize_var_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2961,11 +2929,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX11-LABEL: v_test_canonicalize_var_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2978,11 +2944,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; ; GFX12-LABEL: v_test_canonicalize_var_v2f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 845b25a8f61bd7..7d8f43bbe16b73 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @fcmp_f16_lt( ; SI-LABEL: fcmp_f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -32,33 +32,33 @@ define amdgpu_kernel void @fcmp_f16_lt( ; ; VI-LABEL: fcmp_f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,8 +95,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lt_abs( ; SI-LABEL: fcmp_f16_lt_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -121,33 +121,33 @@ define amdgpu_kernel void @fcmp_f16_lt_abs( ; ; VI-LABEL: 
fcmp_f16_lt_abs: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 -; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1| -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_lt_f16_e64 s[4:5], |v0|, |v1| +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lt_abs: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -187,8 +187,8 @@ entry: define amdgpu_kernel void @fcmp_f16_eq( ; SI-LABEL: fcmp_f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -213,33 +213,33 @@ define amdgpu_kernel void @fcmp_f16_eq( ; ; VI-LABEL: fcmp_f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -276,8 +276,8 @@ entry: define amdgpu_kernel void @fcmp_f16_le( ; SI-LABEL: fcmp_f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -302,33 +302,33 @@ define amdgpu_kernel void @fcmp_f16_le( ; ; VI-LABEL: fcmp_f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,8 +365,8 @@ entry: define amdgpu_kernel void @fcmp_f16_gt( ; SI-LABEL: fcmp_f16_gt: 
; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -391,33 +391,33 @@ define amdgpu_kernel void @fcmp_f16_gt( ; ; VI-LABEL: fcmp_f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ 
-454,8 +454,8 @@ entry: define amdgpu_kernel void @fcmp_f16_lg( ; SI-LABEL: fcmp_f16_lg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -480,33 +480,33 @@ define amdgpu_kernel void @fcmp_f16_lg( ; ; VI-LABEL: fcmp_f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 
s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -543,8 +543,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ge( ; SI-LABEL: fcmp_f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -569,33 +569,33 @@ define amdgpu_kernel void @fcmp_f16_ge( ; ; VI-LABEL: fcmp_f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -632,8 +632,8 @@ entry: define amdgpu_kernel void @fcmp_f16_o( ; SI-LABEL: fcmp_f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -658,33 +658,33 @@ define amdgpu_kernel void @fcmp_f16_o( ; ; VI-LABEL: fcmp_f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -721,8 +721,8 @@ entry: define amdgpu_kernel void @fcmp_f16_u( ; SI-LABEL: fcmp_f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -747,33 +747,33 @@ define amdgpu_kernel void @fcmp_f16_u( ; ; VI-LABEL: fcmp_f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_u: ; GFX11: ; %bb.0: ; %entry 
; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -810,8 +810,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nge( ; SI-LABEL: fcmp_f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -836,33 +836,33 @@ define amdgpu_kernel void @fcmp_f16_nge( ; ; VI-LABEL: fcmp_f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -899,8 +899,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlg( ; SI-LABEL: fcmp_f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -925,33 +925,33 @@ define amdgpu_kernel void @fcmp_f16_nlg( ; ; VI-LABEL: fcmp_f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, 
vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -988,8 +988,8 @@ entry: define amdgpu_kernel void @fcmp_f16_ngt( ; SI-LABEL: fcmp_f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1014,33 +1014,33 @@ define amdgpu_kernel void @fcmp_f16_ngt( ; ; VI-LABEL: fcmp_f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: 
s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1077,8 +1077,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nle( ; SI-LABEL: fcmp_f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1103,33 +1103,33 @@ define amdgpu_kernel void @fcmp_f16_nle( ; ; VI-LABEL: fcmp_f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1166,8 +1166,8 @@ entry: define amdgpu_kernel void @fcmp_f16_neq( ; SI-LABEL: fcmp_f16_neq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1192,33 +1192,33 @@ define amdgpu_kernel void @fcmp_f16_neq( ; ; VI-LABEL: fcmp_f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 
0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1255,8 +1255,8 @@ entry: define amdgpu_kernel void @fcmp_f16_nlt( ; SI-LABEL: fcmp_f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1281,33 +1281,33 @@ define amdgpu_kernel void @fcmp_f16_nlt( ; ; VI-LABEL: fcmp_f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, 
s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1344,8 +1344,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lt( ; SI-LABEL: fcmp_v2f16_lt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1376,21 +1376,21 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; ; VI-LABEL: fcmp_v2f16_lt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1399,14 +1399,14 @@ define amdgpu_kernel void @fcmp_v2f16_lt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1449,8 +1449,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_eq( ; SI-LABEL: fcmp_v2f16_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1481,21 +1481,21 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; ; VI-LABEL: fcmp_v2f16_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 
0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1504,14 +1504,14 @@ define amdgpu_kernel void @fcmp_v2f16_eq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_eq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1553,8 +1553,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_le( ; SI-LABEL: fcmp_v2f16_le: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1585,21 +1585,21 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; ; VI-LABEL: fcmp_v2f16_le: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1608,14 +1608,14 @@ define amdgpu_kernel void @fcmp_v2f16_le( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_le: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1657,8 +1657,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_gt( ; SI-LABEL: fcmp_v2f16_gt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; 
SI-NEXT: s_mov_b32 s14, s10 @@ -1689,21 +1689,21 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; ; VI-LABEL: fcmp_v2f16_gt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1712,14 +1712,14 @@ define amdgpu_kernel void @fcmp_v2f16_gt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_gt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_lg( ; SI-LABEL: fcmp_v2f16_lg: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1794,21 +1794,21 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; ; VI-LABEL: fcmp_v2f16_lg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1817,14 +1817,14 @@ define amdgpu_kernel void @fcmp_v2f16_lg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_lg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1867,8 +1867,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ge( ; SI-LABEL: fcmp_v2f16_ge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1899,21 +1899,21 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; ; VI-LABEL: fcmp_v2f16_ge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1922,14 +1922,14 @@ define amdgpu_kernel void @fcmp_v2f16_ge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; 
GFX11-LABEL: fcmp_v2f16_ge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -1972,8 +1972,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_o( ; SI-LABEL: fcmp_v2f16_o: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2004,21 +2004,21 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; ; VI-LABEL: fcmp_v2f16_o: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2027,14 +2027,14 @@ define amdgpu_kernel void @fcmp_v2f16_o( ; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_o: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2077,8 +2077,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_u( ; SI-LABEL: fcmp_v2f16_u: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2109,21 +2109,21 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; ; VI-LABEL: fcmp_v2f16_u: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, 
s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2132,14 +2132,14 @@ define amdgpu_kernel void @fcmp_v2f16_u( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_u: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2181,8 +2181,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nge( ; SI-LABEL: fcmp_v2f16_nge: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2213,21 +2213,21 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; ; VI-LABEL: fcmp_v2f16_nge: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2236,14 +2236,14 @@ define amdgpu_kernel void @fcmp_v2f16_nge( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nge: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2285,8 +2285,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlg( ; SI-LABEL: fcmp_v2f16_nlg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2317,21 +2317,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; ; VI-LABEL: fcmp_v2f16_nlg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; 
VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2340,14 +2340,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlg( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlg: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2390,8 +2390,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_ngt( ; SI-LABEL: fcmp_v2f16_ngt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2422,21 +2422,21 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; ; VI-LABEL: fcmp_v2f16_ngt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2445,14 +2445,14 @@ define amdgpu_kernel void @fcmp_v2f16_ngt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_ngt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2494,8 +2494,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nle( ; SI-LABEL: fcmp_v2f16_nle: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 
; SI-NEXT: s_mov_b32 s14, s10 @@ -2526,21 +2526,21 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; ; VI-LABEL: fcmp_v2f16_nle: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2549,14 +2549,14 @@ define amdgpu_kernel void @fcmp_v2f16_nle( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nle: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2598,8 +2598,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_neq( ; SI-LABEL: fcmp_v2f16_neq: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2630,21 +2630,21 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; ; VI-LABEL: fcmp_v2f16_neq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2653,14 +2653,14 @@ define amdgpu_kernel void @fcmp_v2f16_neq( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_neq: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2702,8 +2702,8 @@ entry: define amdgpu_kernel void @fcmp_v2f16_nlt( ; SI-LABEL: fcmp_v2f16_nlt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -2734,21 +2734,21 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; ; VI-LABEL: fcmp_v2f16_nlt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2757,14 +2757,14 @@ define amdgpu_kernel void @fcmp_v2f16_nlt( ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: 
s_endpgm ; ; GFX11-LABEL: fcmp_v2f16_nlt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index bd483f4c070713..eda1709e4fd595 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -15,31 +15,30 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_movk_i32 s2, 0x7fff +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s3, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 
v2, s2, v0, v1 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -47,29 +46,29 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; ; GFX9-LABEL: s_copysign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -82,8 +81,8 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, define amdgpu_kernel void 
@s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -94,10 +93,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -106,22 +105,22 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -136,8 +135,8 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -148,10 +147,10 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; VI-LABEL: s_test_copysign_f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -160,22 +159,22 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma ; ; GFX9-LABEL: s_test_copysign_f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -190,8 +189,8 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -202,10 +201,10 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -214,22 +213,22 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_10.0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, 0x7fff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: 
s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -244,8 +243,8 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -256,10 +255,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -268,22 +267,22 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -298,8 +297,8 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -310,10 +309,10 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; VI-LABEL: s_test_copysign_f16_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x8000 +; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -322,22 +321,22 @@ 
define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_neg10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b32 s2, s4, 0x8000 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_or_b32 s0, s4, 0x8000 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -352,26 +351,25 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; 
SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, s4, v0 +; VI-NEXT: v_and_b32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -379,23 +377,23 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_0_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -409,26 +407,25 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 
0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -437,24 +434,24 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half ; ; GFX9-LABEL: s_test_copysign_f16_1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -469,27 +466,26 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -498,24 +494,24 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal ; ; GFX9-LABEL: s_test_copysign_f16_10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -530,26 +526,25 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: 
v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -558,24 +553,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h ; ; GFX9-LABEL: s_test_copysign_f16_neg1_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -590,27 +585,26 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_brev_b32 s2, -2 
-; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -619,24 +613,24 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_test_copysign_f16_neg10_mag: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f16_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4 +; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] @@ -829,8 +823,8 @@ define half 
@v_test_copysign_f16_neg10(half %mag) { define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -855,8 +849,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -880,17 +874,17 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] @@ -899,10 +893,8 
@@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -931,8 +923,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -957,8 +949,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -982,15 +974,15 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 @@ -1001,12 +993,9 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] @@ -1035,8 +1024,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1060,8 +1049,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,14 +1074,14 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1104,10 +1093,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,8 +1123,8 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1163,8 +1150,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,14 +1175,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v1, s[0:1] ; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -1207,12 +1194,10 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v1, s[4:5] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -1239,8 +1224,8 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1266,8 +1251,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1291,14 +1276,14 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_load_dword v1, v1, s[0:1] ; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1310,10 +1295,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1342,35 +1325,35 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: 
v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -1392,15 +1375,15 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1 @@ -1410,12 +1393,10 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 
s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1442,8 +1423,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s11 @@ -1471,8 +1452,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1496,17 +1477,17 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v1, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 ; GFX9-NEXT: global_store_short v2, v0, s[4:5] @@ -1515,10 +1496,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1547,8 +1526,8 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s4, s3, 8 @@ -1611,8 +1590,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, 
s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s7, 8 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff @@ -1669,8 +1648,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 @@ -1727,8 +1706,8 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff ; GFX11-NEXT: s_lshr_b32 s2, s7, 8 @@ -1798,7 +1777,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { ; SI-LABEL: s_copysign_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,7 +1801,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1842,7 +1821,7 @@ define amdgpu_kernel void 
@s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1861,7 +1840,7 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s3 @@ -1887,8 +1866,8 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s4, 16 @@ -1915,8 +1894,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1944,33 +1923,33 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_bfi_b32 v2, s2, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 +; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s2, s6, 16 @@ -1999,8 +1978,8 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2034,8 +2013,8 @@ define 
amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -2065,39 +2044,39 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s3, s7, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 542d67486e7580..f48961c905f58f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) { ; SI-LABEL: s_test_copysign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -34,7 +34,7 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -51,8 +51,8 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: 
s_test_copysign_f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -63,10 +63,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -94,8 +94,8 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -106,10 +106,10 @@ 
define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; ; VI-LABEL: s_test_copysign_f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -119,10 +119,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m ; GFX11-LABEL: s_test_copysign_f32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_10.0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,10 +149,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_10.0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x7fffffff +; VI-NEXT: s_bitset0_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -162,10 +162,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float ; GFX11-LABEL: s_test_copysign_f32_10.0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fffffff +; GFX11-NEXT: s_bitset0_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -180,8 +180,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,10 +192,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; ; VI-LABEL: s_test_copysign_f32_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -205,10 +205,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float ; 
GFX11-LABEL: s_test_copysign_f32_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -223,8 +223,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) { ; SI-LABEL: s_test_copysign_f32_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,10 +235,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -248,10 +248,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: s_or_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_bitset1_b32 s2, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -266,8 +266,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -278,10 +278,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -291,10 +291,10 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_0_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -310,8 +310,8 @@ define amdgpu_kernel void 
@s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -323,10 +323,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; ; VI-LABEL: s_test_copysign_f32_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -337,10 +337,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa ; GFX11-LABEL: s_test_copysign_f32_1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -356,8 +356,8 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -369,10 +369,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; ; VI-LABEL: s_test_copysign_f32_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -383,10 +383,10 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo ; GFX11-LABEL: s_test_copysign_f32_10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -415,10 +415,10 @@ 
define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; ; VI-LABEL: s_test_copysign_f32_neg1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,10 +429,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f ; GFX11-LABEL: s_test_copysign_f32_neg1_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -448,8 +448,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) { ; SI-LABEL: s_test_copysign_f32_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,10 +461,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0x80000000 +; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -475,10 +475,10 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_neg10_mag: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -494,8 +494,8 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_brev_b32 s8, -2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -511,8 +511,8 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 @@ -529,8 +529,8 @@ define amdgpu_kernel 
void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 @@ -549,40 +549,40 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) { ; SI-LABEL: s_test_copysign_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s7, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v0, s7, v0, v2 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v2, s7, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 
0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s7, -2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_bfi_b32 v2, s7, v0, v1 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_bfi_b32 v1, s7, v3, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v3, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 -; VI-NEXT: v_bfi_b32 v0, s7, v0, v3 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v3 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] @@ -591,8 +591,8 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo ; GFX11-LABEL: s_test_copysign_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9 @@ -614,45 +614,45 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) { ; SI-LABEL: s_test_copysign_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; 
SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v2, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_brev_b32 s12, -2 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: v_bfi_b32 v2, s12, v2, v0 +; VI-NEXT: v_bfi_b32 v2, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_bfi_b32 v0, s12, v0, v4 +; VI-NEXT: v_bfi_b32 v0, s2, v0, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo ; GFX11-LABEL: s_test_copysign_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10 @@ -906,46 +906,46 @@ define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) { define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) { ; SI-LABEL: s_test_copysign_f32_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s0, -2 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_bfi_b32 v2, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,7 +958,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +972,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o ; ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1003,7 +1003,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 @@ -1050,24 +1050,23 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { ; SI-LABEL: s_test_copysign_f32_1_fpext_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 -; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f32_1_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000 ; VI-NEXT: s_or_b32 s2, s2, 1.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1079,10 +1078,10 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 @@ -1101,7 +1100,7 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { ; SI-LABEL: s_test_copysign_f32_fpext_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,7 +1116,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s4, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 
v0, 16, s3 @@ -1130,7 +1129,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 4300faa02742a3..b5fa3fd9eccc13 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -11,49 +11,49 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0 define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) { ; SI-LABEL: s_test_copysign_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s6, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_bfi_b32 v1, s6, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_brev_b32 s4, -2 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -65,8 +65,8 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,28 +78,28 @@ define 
amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,8 +111,8 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,28 +124,28 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; ; VI-LABEL: s_test_copysign_f64_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -170,28 +170,28 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: s_test_copysign_f64_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset0_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset0_b32 s1, 31 +; GFX11-NEXT: s_bitset0_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -203,8 +203,8 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) 
%out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -216,28 +216,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s1, 31 +; GFX11-NEXT: s_bitset1_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -249,8 +249,8 @@ define 
amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) { ; SI-LABEL: s_test_copysign_f64_neg10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -262,28 +262,28 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: s_test_copysign_f64_neg10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_neg10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s1, 31 +; GFX11-NEXT: s_bitset1_b32 s3, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 
v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -295,49 +295,49 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) { ; SI-LABEL: s_test_copysign_f64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1d +; SI-NEXT: s_brev_b32 s1, -2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_bfi_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dword s4, s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -350,49 +350,49 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) { ; SI-LABEL: s_test_copysign_f64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: s_brev_b32 s6, -2 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfi_b32 v1, s6, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; 
SI-NEXT: v_bfi_b32 v1, s2, v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_f64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s5, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_bfi_b32 v1, s5, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x74 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -405,7 +405,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) 
%out, double %sign) { ; SI-LABEL: s_test_copysign_f64_0_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -419,7 +419,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_0_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -431,7 +431,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_0_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -463,7 +463,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; VI-LABEL: s_test_copysign_f64_1_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -476,7 +476,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: s_test_copysign_f64_1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -494,7 +494,7 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; VI-LABEL: s_test_copysign_f64_10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -522,7 +522,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou ; ; GFX11-LABEL: s_test_copysign_f64_10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -540,7 +540,7 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -555,7 +555,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; VI-LABEL: s_test_copysign_f64_neg1_mag: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -568,7 +568,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d ; ; GFX11-LABEL: s_test_copysign_f64_neg1_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -586,7 +586,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) { ; SI-LABEL: s_test_copysign_f64_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -601,7 +601,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f64_neg10_mag: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, ; ; GFX11-LABEL: s_test_copysign_f64_neg10_mag: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -632,36 +632,36 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void 
@s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) { ; SI-LABEL: s_test_copysign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s8, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s8, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s8, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_bfi_b32 v1, s8, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -671,8 +671,8 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; GFX11-LABEL: s_test_copysign_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11 ; GFX11-NEXT: v_mov_b32_e32 v2, s9 @@ -693,46 +693,46 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s10, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s10, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: 
s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s10, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s10, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v1, s10, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_bfi_b32 v5, s10, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -746,8 +746,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; GFX11-LABEL: s_test_copysign_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 @@ -771,53 +771,54 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x11 -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v7, s12, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s12, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_brev_b32 s12, -2 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: v_bfi_b32 v1, s12, v2, v0 +; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v2, s19 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_bfi_b32 v7, s12, v0, v2 +; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: 
v_mov_b32_e32 v9, s3 -; VI-NEXT: v_bfi_b32 v5, s12, v0, v2 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v6, s10 ; VI-NEXT: v_mov_b32_e32 v8, s2 @@ -832,8 +833,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; GFX11-LABEL: s_test_copysign_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 ; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index f53d3cf33c9cc8..b14b6421f56b4e 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -10,20 +10,20 @@ define amdgpu_kernel void @v_fdiv_f16( ; SI-LABEL: v_fdiv_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 @@ -46,8 +46,8 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX8-LABEL: v_fdiv_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -74,13 +74,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; ; GFX9-LABEL: v_fdiv_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -92,13 +92,13 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -110,10 +110,8 @@ define amdgpu_kernel void 
@v_fdiv_f16( ; GFX11-LABEL: v_fdiv_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -149,7 +147,7 @@ entry: define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -180,7 +178,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rcp_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -197,7 +195,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -208,7 +206,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rcp_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -219,9 
+217,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rcp_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -245,7 +241,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_abs: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -276,7 +272,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_abs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -293,7 +289,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_abs: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -304,7 +300,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_abs: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -315,9 +311,7 @@ define 
amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_abs: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -344,7 +338,7 @@ entry: define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: reciprocal_f16_rounded: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -375,7 +369,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX8-LABEL: reciprocal_f16_rounded: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -392,7 +386,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX9-LABEL: reciprocal_f16_rounded: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -403,7 +397,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX10-LABEL: reciprocal_f16_rounded: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 
@@ -414,9 +408,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs ; ; GFX11-LABEL: reciprocal_f16_rounded: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -440,7 +432,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -458,7 +450,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -475,7 +467,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -486,7 +478,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_afn: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -497,9 
+489,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_afn: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -523,7 +513,7 @@ entry: define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rcp_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -554,7 +544,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rcp_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -571,7 +561,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rcp_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -582,7 +572,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rcp_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -593,9 +583,7 @@ define 
amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX11-LABEL: v_rcp_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -619,7 +607,7 @@ entry: define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -653,7 +641,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX8-LABEL: v_rsq_f16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -670,7 +658,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX9-LABEL: v_rsq_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -681,7 +669,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX10-LABEL: v_rsq_f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -692,9 +680,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr 
addrspace(1) %r, ptr addrspace(1) %b) # ; ; GFX11-LABEL: v_rsq_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -719,7 +705,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_neg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -753,7 +739,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX8-LABEL: v_rsq_f16_neg: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -771,7 +757,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX9-LABEL: v_rsq_f16_neg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -783,7 +769,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) % ; ; GFX10-LABEL: v_rsq_f16_neg: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -795,9 +781,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, 
ptr addrspace(1) % ; ; GFX11-LABEL: v_rsq_f16_neg: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -824,7 +808,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_multi_use: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -860,7 +844,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX8-LABEL: v_rsq_f16_multi_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -879,7 +863,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX9-LABEL: v_rsq_f16_multi_use: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -892,7 +876,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac ; ; GFX10-LABEL: v_rsq_f16_multi_use: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -905,9 +889,7 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr 
addrspace(1) %r, ptr addrspac ; ; GFX11-LABEL: v_rsq_f16_multi_use: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -935,7 +917,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract0: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -969,7 +951,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -987,7 +969,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract0: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -999,7 +981,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract0: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1011,9 +993,7 
@@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract0: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1040,7 +1020,7 @@ entry: define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1074,7 +1054,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX8-LABEL: v_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1072,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX9-LABEL: v_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1104,7 +1084,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX10-LABEL: v_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1116,9 +1096,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr ; ; GFX11-LABEL: v_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1145,7 +1123,7 @@ entry: define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_neg_rsq_f16_missing_contract1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1179,7 +1157,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX8-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1197,7 +1175,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1209,7 +1187,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1221,9 +1199,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ; ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1250,20 +1226,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { ; SI-LABEL: v_fdiv_f16_afn: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1274,8 +1250,8 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) 
; ; GFX8-LABEL: v_fdiv_f16_afn: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1298,13 +1274,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; ; GFX9-LABEL: v_fdiv_f16_afn: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1314,13 +1290,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX10-LABEL: v_fdiv_f16_afn: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1330,10 +1306,8 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) ; GFX11-LABEL: 
v_fdiv_f16_afn: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -1363,20 +1337,20 @@ entry: define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 { ; SI-LABEL: v_fdiv_f16_unsafe: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[10:11], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_rcp_f32_e32 v3, v3 @@ -1387,8 +1361,8 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX8-LABEL: v_fdiv_f16_unsafe: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1411,13 +1385,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; ; GFX9-LABEL: v_fdiv_f16_unsafe: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v2, v2 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1427,13 +1401,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX10-LABEL: v_fdiv_f16_unsafe: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v2, v2 ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 @@ -1443,10 +1417,8 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace( ; GFX11-LABEL: v_fdiv_f16_unsafe: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc @@ -1476,7 +1448,7 @@ entry: define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_2_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1491,7 +1463,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_2_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1503,7 +1475,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_2_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1514,7 +1486,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_2_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1525,7 +1497,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; 
GFX11-LABEL: div_afn_2_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 @@ -1543,7 +1515,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1558,7 +1530,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1570,7 +1542,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1581,7 +1553,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1592,7 +1564,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr 
addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 @@ -1610,7 +1582,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; SI-LABEL: div_afn_neg_k_x_pat_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1625,7 +1597,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8-LABEL: div_afn_neg_k_x_pat_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1637,7 +1609,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX9-LABEL: div_afn_neg_k_x_pat_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1648,7 +1620,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX10-LABEL: div_afn_neg_k_x_pat_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0 @@ -1659,7 +1631,7 
@@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX11-LABEL: div_afn_neg_k_x_pat_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index c6b730e3fd5d6f..0468175c5df50d 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -68,7 +68,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ninf: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -181,7 +181,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_ieee: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX7-LABEL: s_fdiv_f32_ieee: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -253,7 +253,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -275,7 +275,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 @@ -294,7 +294,7 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 @@ -334,7 +334,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_25ulp_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX67-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 @@ -353,7 +353,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; 
GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -384,7 +384,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| @@ -420,7 +420,7 @@ entry: define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -446,7 +446,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 @@ -482,7 +482,7 @@ define 
amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -498,7 +498,7 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 @@ -535,7 +535,7 @@ entry: define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { ; GFX67-LABEL: s_fdiv_fast_ieee_f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -548,7 +548,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -559,7 +559,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -569,7 +569,7 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; 
GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -599,7 +599,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -612,7 +612,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -633,7 +633,7 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -687,7 +687,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -697,7 +697,7 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -727,7 +727,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -753,7 +753,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_arcp_daz: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX7-LABEL: s_fdiv_f32_arcp_daz: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 @@ -829,7 +829,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -850,7 +850,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 @@ -892,7 +892,7 @@ entry: define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX67-LABEL: s_fdiv_f32_arcp_ninf: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -905,7 +905,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s3 @@ -926,7 +926,7 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -956,8 +956,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -996,10 +996,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; 
GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s4 @@ -1012,11 +1013,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 @@ -1031,14 +1031,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s6, v4 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,10 +1076,11 @@ define amdgpu_kernel void 
@s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, s5, v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -1093,11 +1093,10 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s6, s6, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s6, s6, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 ; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1109,7 +1108,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4 @@ -1118,10 +1116,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s5 +; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, 
s7, s5 ; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1131,9 +1130,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s0, s6, s6, s4 +; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5 ; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4 @@ -1155,8 +1153,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5 ; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5 @@ -1214,8 +1212,8 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1228,8 +1226,8 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s6 ; GFX8-NEXT: v_rcp_f32_e32 v1, s7 @@ -1243,22 +1241,22 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, s7 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, s7 @@ -1292,8 +1290,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) @@ -1306,8 +1304,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; ; 
GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1321,22 +1319,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1370,8 +1368,8 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX67-NEXT: s_mov_b32 s3, 0xf000 ; GFX67-NEXT: s_mov_b32 s2, -1 ; GFX67-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1384,8 +1382,8 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s7 ; GFX8-NEXT: v_rcp_f32_e32 v2, s6 @@ -1399,22 +1397,22 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v0, s7 ; GFX10-NEXT: v_rcp_f32_e32 v2, s6 ; GFX10-NEXT: v_mul_f32_e32 v1, s5, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, s4, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: v_rcp_f32_e32 v2, s6 @@ -1448,7 +1446,7 @@ entry: define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v4f32: ; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-FASTFMA-NEXT: 
s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 @@ -1519,7 +1517,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v4f32: ; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1590,7 +1588,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX7-LABEL: s_fdiv_v4f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 @@ -1661,7 +1659,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: s_fdiv_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1732,7 +1730,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: s_fdiv_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,7 +1792,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: s_fdiv_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1895,7 
+1893,7 @@ define amdgpu_kernel void @s_fdiv_v4f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_fast_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -1914,7 +1912,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_fast_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -1933,7 +1931,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_fast_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1951,7 +1949,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_fast_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2003,7 +2001,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_fast_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX67-LABEL: s_fdiv_v4f32_arcp_math: ; GFX67: ; %bb.0: -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX67-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX67-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX67-NEXT: s_mov_b32 s11, 0xf000 @@ -2022,7 +2020,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX8-LABEL: s_fdiv_v4f32_arcp_math: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 @@ -2041,7 +2039,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: s_fdiv_v4f32_arcp_math: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -2059,7 +2057,7 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: s_fdiv_v4f32_arcp_math: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -2111,8 +2109,8 @@ define amdgpu_kernel void @s_fdiv_v4f32_arcp_math(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,11 +2132,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr 
addrspac ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2152,14 +2150,13 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2181,11 +2178,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -2197,7 +2194,6 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2205,11 +2201,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; ; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -2221,7 +2217,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2229,11 +2225,11 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -2246,7 +2242,7 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2271,8 +2267,8 @@ entry: define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr addrspace(1) %out, float %a) #1 { ; GFX6-FASTFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) @@ -2292,11 +2288,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX6-SLOWFMA-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[2:3], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v0, s[2:3], s4, 
s4, 1.0 ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v2, v0 @@ -2308,14 +2304,13 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2335,11 +2330,11 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0 +; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0 ; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX8-NEXT: v_rcp_f32_e32 v2, v0 ; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0 ; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2 @@ -2349,7 +2344,6 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2357,21 +2351,21 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; ; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s4, 1.0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -2379,22 +2373,22 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, 
vcc_lo, 1.0, s4, 1.0 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0 +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 8e43bd890a8fa4..c56b4ae3c34f5d 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -37,7 +37,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -52,7 +52,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -69,7 +69,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -102,7 +102,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -185,7 +185,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -215,7 +215,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -235,7 
+235,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -286,7 +286,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg ; ; GCN-FLUSH-LABEL: div_v4_minus_1_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; 
GCN-DENORM-LABEL: div_v4_c_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -369,7 +369,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -401,7 +401,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -433,7 +433,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v4, 0 @@ -468,40 +468,40 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { ; GCN-DENORM-LABEL: div_v_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x24 +; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 ; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s4 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s2 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s0 ; GCN-DENORM-NEXT: v_sub_u32_e32 v2, v2, v4 ; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v1 ; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-DENORM-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v_by_x_25ulp: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN-FLUSH-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v2, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s2|, v0 +; GCN-FLUSH-NEXT: v_cmp_gt_f32_e64 vcc, |s0|, v0 ; GCN-FLUSH-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s2, v0 +; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s0, v0 ; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, s4, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-FLUSH-NEXT: global_store_dword v2, v0, s[2:3] ; GCN-FLUSH-NEXT: s_endpgm %load = load float, ptr addrspace(1) %arg, align 4 
%div = fdiv float %num, %load, !fpmath !0 @@ -512,7 +512,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) { define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -529,7 +529,7 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_fast: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -540,7 +540,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -558,7 +558,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-LABEL: div_1_by_minus_x_fast: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -576,7 +576,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast: ; GCN-DENORM: ; %bb.0: -; 
GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -587,7 +587,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -606,7 +606,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -627,7 +627,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; ; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -656,7 +656,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, 
s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -677,7 +677,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -706,7 +706,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -727,7 +727,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % ; ; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) @@ -757,7 +757,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) % define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) @@ -778,7 +778,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr 
addrspac ; ; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded: ; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index dbf78ae0e96ca4..1fb50959987df5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -7,10 +7,10 @@ declare void @extern_func() #0 define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: s_mov_b64 s[0:1], src_private_base ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, s1 @@ -37,10 +37,10 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_in_kernel_no_calls() { ; FLAT_SCR_OPT-LABEL: stack_object_in_kernel_no_calls: ; FLAT_SCR_OPT: ; %bb.0: -; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, s3 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 +; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; 
FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 0 ; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 0 ; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s0 @@ -112,13 +112,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_load_dword vcc_lo, s[2:3], 0x8 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GCN-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: v_writelane_b32 v0, s2, 0 +; GCN-NEXT: v_writelane_b32 v0, s3, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 087d38ce7b0046..0af57c6a97db5c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -14,19 +14,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; 
GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -36,18 +35,17 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -57,13 +55,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -76,13 +73,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -95,9 +91,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -110,11 +106,10 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 
0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -142,19 +137,18 @@ bb: define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -164,8 +158,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 
; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -186,15 +179,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -206,15 +198,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -226,11 +217,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -243,12 +232,11 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -276,19 +264,18 @@ bb: define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: 
soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -298,8 +285,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff1_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -320,15 +306,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 
4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -340,15 +325,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -360,11 +344,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff1_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, 
s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS @@ -377,12 +359,11 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff1_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -410,20 +391,19 @@ bb: define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: 
v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -433,19 +413,18 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -455,15 +434,13 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -476,15 +453,13 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -497,9 +472,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -514,12 +489,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -547,9 +522,8 @@ bb: define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -569,8 +543,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -592,17 +565,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, 
s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -613,17 +585,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 
2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -635,13 +606,12 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -653,14 +623,12 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; 
GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -688,9 +656,8 @@ bb: define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -710,8 +677,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff2_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -733,17 +699,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 
4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -754,17 +719,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -776,13 +740,12 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff2_voff4: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -794,14 +757,12 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff2_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -829,20 +790,19 @@ bb: define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -852,19 +812,18 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff1: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-GISEL-NEXT: v_add3_u32 v0, v1, s0, v0 ; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX940-GISEL-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX940-GISEL-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-GISEL-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-GISEL-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX940-GISEL-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 4 @@ -874,15 +833,13 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc @@ -895,15 +852,13 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc @@ -916,9 +871,9 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff1: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 
v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -933,12 +888,12 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -966,9 +921,8 @@ bb: define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 @@ -988,8 +942,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff2: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -1011,17 +964,16 @@ define amdgpu_kernel void 
@soff4_voff2(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 4, v0 ; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc @@ -1032,17 +984,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; 
GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1054,13 +1005,12 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff2: ; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1072,14 +1022,12 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 @@ -1107,18 +1055,17 @@ bb: define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb -; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-SDAG-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 ; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v3, off offset:2 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -1128,8 +1075,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX940-GISEL-LABEL: soff4_voff4: ; GFX940-GISEL: ; %bb.0: ; %bb -; GFX940-GISEL-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-GISEL-NEXT: v_mov_b32_e32 v2, 1 @@ -1151,14 +1097,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 
+; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 @@ -1172,17 +1116,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 2, v0 ; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v4, v1, off dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1194,13 +1137,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-SDAG-LABEL: soff4_voff4: ; 
GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, 0 ; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1212,14 +1154,12 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; ; GFX12-GISEL-LABEL: soff4_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_add3_u32 v0, 0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 14d8b71c5167a2..850be72f06c7d0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -13,13 +13,13 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -31,10 +31,10 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX10-LABEL: zero_init_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 ; GFX10-NEXT: s_mov_b32 s2, s0 @@ -83,18 +83,18 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX9-PAL-LABEL: zero_init_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; 
GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 @@ -120,15 +120,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 @@ -145,15 +145,15 @@ define amdgpu_kernel void @zero_init_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; 
GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 @@ -374,9 +374,9 @@ define void @zero_init_foo() { define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 @@ -392,11 +392,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 @@ -412,7 +412,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 15 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s1, s0, 15 @@ -428,7 +428,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_and_b32 s1, s0, 15 @@ -444,15 +444,15 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 0 @@ -466,7 +466,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 15 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s1, s0, 2 @@ -482,16 +482,16 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX10-PAL-LABEL: store_load_sindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; 
GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -507,7 +507,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -523,7 +523,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: s_and_b32 s1, s0, 15 @@ -707,9 +707,9 @@ bb: define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: scratch_store_dword v1, v2, off @@ -721,10 +721,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-LABEL: store_load_vindex_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -738,8 +738,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -750,8 +749,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -761,17 +759,17 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; 
GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc @@ -781,7 +779,6 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX940-LABEL: store_load_vindex_kernel: ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -792,15 +789,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX10-PAL-LABEL: store_load_vindex_kernel: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX10-PAL-NEXT: s_mov_b32 s2, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX10-PAL-NEXT: s_and_b32 
s3, s3, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0, v0 @@ -814,8 +811,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -826,8 +822,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -1068,8 +1063,8 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) { define amdgpu_kernel void @zero_init_small_offset_kernel() { ; GFX9-LABEL: zero_init_small_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1088,10 +1083,10 @@ define 
amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX10-LABEL: zero_init_small_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -1146,19 +1141,19 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_small_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -1187,15 +1182,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: 
s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1214,15 +1209,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, 
off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -1473,9 +1468,9 @@ define void @zero_init_small_offset_foo() { define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1493,11 +1488,11 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -1515,7 +1510,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -1533,7 +1528,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: 
store_load_sindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -1551,17 +1546,17 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1576,7 +1571,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_small_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -1594,16 +1589,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: 
store_load_sindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1622,16 +1617,16 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 
s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1649,7 +1644,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1667,7 +1662,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -1905,8 +1900,8 @@ bb: define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1922,10 +1917,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_small_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc @@ -1943,8 +1938,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1957,8 +1950,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1968,16 +1959,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, 
s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 @@ -1993,7 +1984,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -2004,15 +1994,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -2028,15 +2018,15 @@ define amdgpu_kernel 
void @store_load_vindex_small_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc @@ -2054,8 +2044,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2068,8 +2056,6 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -2260,8 +2246,8 @@ bb: define amdgpu_kernel void @zero_init_large_offset_kernel() { ; GFX9-LABEL: zero_init_large_offset_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2281,10 +2267,10 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX10-LABEL: zero_init_large_offset_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, 0 @@ -2341,19 +2327,19 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX9-PAL-LABEL: zero_init_large_offset_kernel: ; GFX9-PAL: ; %bb.0: -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 -; GFX9-PAL-NEXT: s_mov_b32 s3, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; 
GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_mov_b32 s1, s0 +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_mov_b32 s3, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 @@ -2384,15 +2370,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2412,15 +2398,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 
s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 @@ -2725,9 +2711,9 @@ define void @zero_init_large_offset_foo() { define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2745,11 +2731,11 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 @@ -2767,7 +2753,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-LABEL: store_load_sindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 15 @@ -2785,7 +2771,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-LABEL: store_load_sindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 15 @@ -2803,17 +2789,17 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 
offset:4 glc -; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2828,7 +2814,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX940-LABEL: store_load_sindex_large_offset_kernel: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 15 @@ -2846,16 +2832,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1010-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2874,16 +2860,16 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; 
GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX1030-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2901,7 +2887,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 @@ -2919,7 +2905,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; ; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: 
v_mov_b32_e32 v0, 15 @@ -3157,8 +3143,8 @@ bb: define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3174,10 +3160,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX10-LABEL: store_load_vindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3196,8 +3182,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3210,8 +3194,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_sub_nc_u32_e32 v2, 0x4000, v0 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3221,16 +3203,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 @@ -3246,7 +3228,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, 15 ; GFX940-NEXT: s_movk_i32 s0, 0x4004 ; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 @@ -3258,15 +3239,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 @@ -3282,15 +3263,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc @@ -3309,8 +3290,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 
; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3323,8 +3302,6 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 @@ -3518,8 +3495,8 @@ bb: define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-LABEL: store_load_large_imm_offset_kernel: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 @@ -3535,10 +3512,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX10-LABEL: store_load_large_imm_offset_kernel: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 
15 ; GFX10-NEXT: s_movk_i32 s0, 0x3800 @@ -3576,15 +3553,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX9-PAL-NEXT: s_mov_b32 s2, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 @@ -3611,15 +3588,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1010-PAL: ; %bb.0: ; %bb -; GFX1010-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1010-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1010-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1010-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1010-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3635,15 +3612,15 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX1030-PAL: ; %bb.0: ; %bb -; GFX1030-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX1030-PAL-NEXT: s_mov_b32 s10, s0 -; GFX1030-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX1030-PAL-NEXT: s_add_u32 s10, s10, s9 -; GFX1030-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 @@ -3841,10 +3818,10 @@ bb: define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-LABEL: store_load_vidx_sidx_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 @@ -3857,11 +3834,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-LABEL: 
store_load_vidx_sidx_offset: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_add_u32 s6, s6, s11 -; GFX10-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3874,11 +3851,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-LABEL: store_load_vidx_sidx_offset: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3888,10 +3864,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-LABEL: store_load_vidx_sidx_offset: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -3901,16 +3876,16 @@ define amdgpu_kernel void 
@store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX9-PAL: ; %bb.0: ; %bb -; GFX9-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX9-PAL-NEXT: s_mov_b32 s10, s0 -; GFX9-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX9-PAL-NEXT: s_mov_b32 s4, s0 +; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s10, s9 +; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 @@ -3921,8 +3896,7 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX940-LABEL: store_load_vidx_sidx_offset: ; GFX940: ; %bb.0: ; %bb -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 @@ -3936,16 +3910,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[10:11] -; GFX10-PAL-NEXT: s_mov_b32 s10, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] +; GFX10-PAL-NEXT: s_mov_b32 s4, s0 +; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX10-PAL-NEXT: s_add_u32 s10, s10, s9 
-; GFX10-PAL-NEXT: s_addc_u32 s11, s11, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; GFX10-PAL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 +; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 @@ -3958,11 +3932,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3972,10 +3945,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset: ; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1024 
scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index c9618d43943ef2..e44572985e6d2e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -21,14 +21,14 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -36,11 +36,11 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -55,14 +55,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -70,14 +70,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -85,11 +85,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -104,14 +104,14 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_add_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -119,14 +119,14 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 
0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -134,11 +134,11 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -155,8 +155,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -173,8 +173,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -191,12 +191,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; 
GCN3-LABEL: atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -214,18 +214,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -233,18 +233,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_add_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 
0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -252,11 +252,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_add_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -276,18 +276,18 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -298,18 +298,18 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -320,11 +320,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -348,12 +348,12 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; GCN1-LABEL: 
atomic_add_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -361,12 +361,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -374,11 +374,11 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -392,8 +392,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) 
{ ; GCN1-LABEL: atomic_add_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -408,8 +408,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_add_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -424,12 +424,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_add_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -446,16 +446,16 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_add v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -463,16 +463,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_add v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -480,11 +480,11 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_add_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -503,16 +503,16 @@ entry: define amdgpu_kernel void 
@atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -523,16 +523,16 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_add_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -543,11 +543,11 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_add_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -570,14 +570,14 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -585,14 +585,14 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol @@ -600,11 +600,11 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -619,8 +619,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -637,8 +637,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -655,12 +655,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -678,18 +678,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -697,18 +697,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_and_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -716,11 +716,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_and_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -740,18 +740,18 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -762,18 +762,18 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr 
%out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -784,11 +784,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -812,12 +812,12 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_and_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -825,12 +825,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -838,11 +838,11 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -856,8 +856,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_and_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -872,8 +872,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_and_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -888,12 +888,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_and_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -910,16 +910,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -927,16 +927,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -944,11 +944,11 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_and_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -967,16 +967,16 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -987,16 +987,16 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_and_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1007,11 +1007,11 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_and_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1034,14 +1034,14 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1049,14 +1049,14 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1064,11 +1064,11 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1083,8 +1083,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1101,8 +1101,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1119,12 +1119,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1142,18 +1142,18 @@ entry: define 
amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1161,18 +1161,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1180,11 +1180,11 @@ define amdgpu_kernel void 
@atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_sub_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1204,18 +1204,18 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1226,18 +1226,18 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1248,11 +1248,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1276,12 +1276,12 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_sub_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1289,12 +1289,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1302,11 +1302,11 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1320,8 +1320,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1336,8 +1336,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1352,12 +1352,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -1374,16 +1374,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1391,16 +1391,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1408,11 +1408,11 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_sub_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1431,16 +1431,16 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1451,16 +1451,16 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_sub_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1471,11 +1471,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_sub_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1498,39 +1498,39 @@ entry: define amdgpu_kernel void 
@atomic_max_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, 
ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -1562,8 +1562,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -1580,12 +1580,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1603,47 +1603,47 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1662,18 +1662,18 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr 
%out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1684,18 +1684,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -1706,11 +1706,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; 
GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1734,35 +1734,35 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_max_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1775,8 +1775,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_max_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_max_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1807,12 +1807,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_max_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -1829,43 +1829,43 @@ entry: define amdgpu_kernel void 
@atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1883,16 +1883,16 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -1903,16 +1903,16 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v0, s6 @@ -1923,11 +1923,11 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -1950,39 +1950,39 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -1996,8 +1996,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2014,8 +2014,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2032,12 +2032,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2055,47 +2055,47 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; 
GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2114,18 +2114,18 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2136,18 +2136,18 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, 
s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2158,11 +2158,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2186,35 +2186,35 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umax_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2227,8 +2227,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umax_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2243,8 +2243,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umax_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2259,12 +2259,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umax_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2281,43 +2281,43 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umax_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2335,16 +2335,16 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2355,16 +2355,16 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; 
GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2375,11 +2375,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2402,39 +2402,39 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, 
s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2448,8 +2448,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2466,8 +2466,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr 
%out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2484,12 +2484,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2507,47 +2507,47 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2566,18 +2566,18 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 
s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2588,18 +2588,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2610,11 +2610,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,35 +2638,35 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2679,8 +2679,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_min_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2695,8 +2695,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2711,12 +2711,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2733,43 +2733,43 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_min_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2787,16 +2787,16 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -2807,16 +2807,16 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -2827,11 +2827,11 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,39 +2854,39 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: 
v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -2900,8 +2900,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -2918,8 +2918,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -2936,12 +2936,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -2959,47 +2959,47 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: 
atomic_umin_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3018,18 +3018,18 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3040,18 +3040,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, 
s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3062,11 +3062,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3090,35 +3090,35 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_umin_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_umin_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) @@ -3131,8 +3131,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_umin_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3147,8 +3147,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_umin_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3163,12 +3163,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_umin_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword 
s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3185,43 +3185,43 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: 
atomic_umin_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,16 +3239,16 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 @@ -3259,16 +3259,16 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umin_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 @@ -3279,11 +3279,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umin_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3306,14 +3306,14 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3321,14 +3321,14 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_offset: ; GCN2: ; 
%bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3336,11 +3336,11 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3355,8 +3355,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3373,8 +3373,8 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 
0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3391,12 +3391,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3414,18 +3414,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3433,18 +3433,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i32_addr64_offset: ; 
GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3452,11 +3452,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; ; GCN3-LABEL: atomic_or_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3476,18 +3476,18 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3498,18 +3498,18 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3520,11 +3520,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) 
-; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3548,12 +3548,12 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_or_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3561,12 +3561,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3574,11 +3574,11 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, 
s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3592,8 +3592,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_or_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3608,8 +3608,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_or_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3624,12 +3624,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_or_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3646,16 +3646,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3663,16 +3663,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3680,11 +3680,11 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; ; GCN3-LABEL: atomic_or_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; 
GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3703,16 +3703,16 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3723,16 +3723,16 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN2-LABEL: atomic_or_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3743,11 +3743,11 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; ; GCN3-LABEL: atomic_or_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3770,14 +3770,14 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3785,14 +3785,14 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: 
s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3800,11 +3800,11 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,14 +3819,14 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GCN1-LABEL: atomic_xchg_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3834,14 +3834,14 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN2-LABEL: atomic_xchg_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c 
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3849,11 +3849,11 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3868,8 +3868,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -3886,8 +3886,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 
0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -3904,12 +3904,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3927,18 +3927,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3946,18 +3946,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3965,11 +3965,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; ; GCN3-LABEL: atomic_xchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -3989,18 +3989,18 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4011,18 +4011,18 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4033,11 +4033,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; 
GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4061,12 +4061,12 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4074,12 +4074,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4087,11 +4087,11 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: 
v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +4105,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4121,8 +4121,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4137,12 +4137,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4159,16 +4159,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4176,16 +4176,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4193,11 +4193,11 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c 
; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4216,16 +4216,16 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4236,16 +4236,16 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_xchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4256,11 +4256,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_xchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4285,7 +4285,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4315,7 +4315,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4334,8 +4334,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 
%in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 16 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -4353,8 +4353,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 16 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -4372,13 +4372,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4397,19 +4397,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4418,19 +4418,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; 
GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4465,19 +4465,19 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4489,19 +4489,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 
-; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4513,12 +4513,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4544,7 +4544,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 @@ -4557,7 +4557,7 @@ define 
amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 @@ -4570,7 +4570,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v2, s2 @@ -4588,8 +4588,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 @@ -4605,8 +4605,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 @@ -4622,13 +4622,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4646,17 +4646,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4665,17 +4665,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x3c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4684,12 +4684,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; ; GCN3-LABEL: atomic_cmpxchg_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s7, s[2:3], 0x3c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4709,17 +4709,17 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) { ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dword s8, s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; 
GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4731,17 +4731,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4753,12 +4753,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s9, s[2:3], 0x44 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; 
GCN3-NEXT: s_addc_u32 s1, s5, s1 @@ -4783,14 +4783,14 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4798,14 +4798,14 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4813,11 +4813,11 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 
-; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4832,8 +4832,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -4850,8 +4850,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -4868,12 +4868,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4891,18 +4891,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64_offset: ; GCN1: 
; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4910,18 +4910,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4929,11 +4929,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_xor_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -4953,18 +4953,18 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4975,18 +4975,18 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4997,11 +4997,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5025,12 +5025,12 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_xor_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32: ; GCN2: ; %bb.0: ; %entry 
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5051,11 +5051,11 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5069,8 +5069,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -5085,8 +5085,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -5101,12 +5101,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -5123,16 +5123,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5140,16 +5140,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5157,11 +5157,11 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_xor_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5180,16 +5180,16 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: 
v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5200,16 +5200,16 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_xor_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5220,11 +5220,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_xor_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5247,7 +5247,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5300,7 +5300,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5314,7 +5314,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5328,7 +5328,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5348,8 +5348,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; 
GCN1-LABEL: atomic_load_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5368,8 +5368,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5388,10 +5388,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5414,8 +5414,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5432,8 +5432,8 @@ define amdgpu_kernel void 
@atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5450,10 +5450,10 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5475,37 +5475,37 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, 
s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5518,33 +5518,33 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; 
GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5556,8 +5556,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5572,8 +5572,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5588,15 +5588,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -5609,8 +5609,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5623,8 +5623,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5637,15 +5637,15 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; ; GCN3-LABEL: atomic_store_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ 
-5657,7 +5657,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5673,7 +5673,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5710,7 +5710,7 @@ entry: define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5724,7 +5724,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -5758,8 +5758,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5778,8 +5778,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5798,10 +5798,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5824,8 +5824,8 @@ entry: define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5842,8 +5842,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5860,10 +5860,10 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN3-LABEL: atomic_load_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -5885,37 +5885,37 @@ entry: define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: 
s_endpgm ; ; GCN2-LABEL: atomic_store_f32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -5928,33 +5928,33 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) { ; GCN1-LABEL: atomic_store_f32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_dword v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_dword v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -5966,8 +5966,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -5982,8 +5982,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN2-LABEL: atomic_store_f32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -5998,15 +5998,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; ; GCN3-LABEL: 
atomic_store_f32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6019,8 +6019,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6033,8 +6033,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN2-LABEL: atomic_store_f32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6047,15 +6047,15 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; ; GCN3-LABEL: atomic_store_f32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GCN3-NEXT: 
s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: @@ -6067,7 +6067,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6083,7 +6083,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6099,7 +6099,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6120,7 +6120,7 @@ entry: define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6134,7 +6134,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, 
s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6148,7 +6148,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6168,8 +6168,8 @@ entry: define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 @@ -6187,8 +6187,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_load_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 @@ -6206,11 +6206,11 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_load_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s0, s4, s0 -; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: s_add_u32 s0, s4, s2 +; GCN3-NEXT: s_addc_u32 s1, s5, s3 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_ubyte 
v2, v[0:1] offset:16 glc @@ -6231,37 +6231,37 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i8_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6274,33 +6274,33 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr 
%out) { ; GCN1-LABEL: atomic_store_i8: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_byte v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i8: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_byte v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i8: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_byte v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6312,8 +6312,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i8_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, s6 ; GCN1-NEXT: s_addc_u32 s1, s5, s7 @@ -6327,8 +6327,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN2-LABEL: atomic_store_i8_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, s6 ; GCN2-NEXT: s_addc_u32 s1, s5, s7 @@ -6342,14 +6342,14 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; ; GCN3-LABEL: atomic_store_i8_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s0, s4, s6 ; GCN3-NEXT: s_addc_u32 s1, s5, s7 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6362,7 +6362,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6378,7 +6378,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6394,7 +6394,7 
@@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6415,7 +6415,7 @@ entry: define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6429,7 +6429,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -6463,8 +6463,8 @@ entry: define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -6483,8 +6483,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: 
atomic_load_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -6503,10 +6503,10 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_load_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -6529,37 +6529,37 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6572,33 +6572,33 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_i16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 
; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_i16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6610,8 +6610,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i16_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN1-NEXT: s_add_u32 s0, s4, s0 @@ -6626,8 +6626,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i16_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN2-NEXT: s_add_u32 s0, s4, s0 @@ -6642,15 +6642,15 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; ; GCN3-LABEL: atomic_store_i16_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 ; GCN3-NEXT: 
s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v2, s8 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: @@ -6663,37 +6663,37 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; 
GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm @@ -6706,33 +6706,33 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GCN1-LABEL: atomic_store_f16: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_f16: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_f16: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6744,33 +6744,33 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) 
{ ; GCN1-LABEL: atomic_store_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6782,33 +6782,33 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GCN1-LABEL: atomic_store_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb -; GCN1-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN1-NEXT: 
s_load_dword s0, s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_store_short v[0:1], v2 ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_store_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_store_short v[0:1], v2 ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_store_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm @@ -6819,14 +6819,14 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6834,14 +6834,14 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6849,11 +6849,11 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6868,14 +6868,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: 
s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6883,14 +6883,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6898,11 +6898,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6917,14 +6917,14 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr 
%out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6932,14 +6932,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6947,11 +6947,11 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, 
s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -6968,8 +6968,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -6986,8 +6986,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7004,12 +7004,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7027,18 +7027,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7046,18 +7046,18 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i32_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7065,11 +7065,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_inc_i32_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7089,18 +7089,18 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7111,18 +7111,18 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: 
s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7133,11 +7133,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7161,12 +7161,12 @@ entry: define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_inc_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7174,12 +7174,12 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7187,11 +7187,11 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7205,8 +7205,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_inc_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7221,8 +7221,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_inc_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 
v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7237,12 +7237,12 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_inc_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7259,16 +7259,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7276,16 +7276,16 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i32_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7293,11 +7293,11 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_inc_i32_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7316,16 +7316,16 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i32_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: 
flat_atomic_inc v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7336,16 +7336,16 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_inc_i32_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7356,11 +7356,11 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_inc_i32_ret_incr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7383,14 +7383,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 16 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 16 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7398,14 +7398,14 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 16 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7413,11 +7413,11 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7432,14 +7432,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, 
i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0xffc -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0xffc +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7447,14 +7447,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0xffc -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0xffc +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7462,11 +7462,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: 
v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7481,14 +7481,14 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_max_offset_p1: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7496,14 +7496,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_max_offset_p1: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 0x1000 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s2, 0x1000 +; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7511,11 +7511,11 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_max_offset_p1: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, 
s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_mov_b32_e32 v2, s4 @@ -7532,8 +7532,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s4, 16 ; GCN1-NEXT: s_addc_u32 s1, s5, 0 @@ -7550,8 +7550,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s4, 16 ; GCN2-NEXT: s_addc_u32 s1, s5, 0 @@ -7568,12 +7568,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
buffer_wbinvl1_vol @@ -7591,18 +7591,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7610,18 +7610,18 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i32_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7629,11 +7629,11 @@ 
define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; ; GCN3-LABEL: atomic_dec_i32_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7653,18 +7653,18 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7675,18 +7675,18 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7697,11 +7697,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7725,12 +7725,12 @@ entry: define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_dec_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7738,12 +7738,12 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7751,11 +7751,11 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s2 +; GCN3-NEXT: v_mov_b32_e32 v1, s3 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7769,8 +7769,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; GCN1-LABEL: atomic_dec_i32_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -7785,8 +7785,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN2-LABEL: atomic_dec_i32_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -7801,12 +7801,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) { ; ; GCN3-LABEL: atomic_dec_i32_ret: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s0 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7823,16 +7823,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7840,16 +7840,16 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i32_decr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7857,11 +7857,11 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; ; GCN3-LABEL: atomic_dec_i32_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7880,16 +7880,16 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i32_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[2:3], 2 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -7900,16 +7900,16 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_dec_i32_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -7920,11 +7920,11 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_dec_i32_ret_decr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s8, s[2:3], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,7 +7947,7 @@ entry: define amdgpu_kernel void 
@atomic_load_f16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -7963,7 +7963,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8013,7 +8013,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8027,7 +8027,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_f16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8046,7 +8046,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16_offset: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -8062,7 +8062,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16_offset: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -8078,7 +8078,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16_offset: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 @@ -8098,7 +8098,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_bf16: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -8112,7 +8112,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_bf16: ; GCN2: ; %bb.0: -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -8126,7 +8126,7 @@ define 
amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; ; GCN3-LABEL: atomic_load_bf16: ; GCN3: ; %bb.0: -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 4d80e9124f41f9..5bd527149572e5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3823,7 +3823,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -3883,7 +3883,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -3918,8 +3918,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; 
%entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -3953,8 +3953,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -3988,32 +3988,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: 
v_max_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4029,7 +4029,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4057,7 +4057,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -4085,7 +4085,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -4119,8 +4119,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_max_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -4152,8 +4152,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_max_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -4185,32 +4185,32 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_max_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_i32_e32 v2, s0, v3 +; 
GCN3-NEXT: v_max_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -4966,7 +4966,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -4996,7 +4996,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -5026,7 +5026,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -5061,8 +5061,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { 
; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5096,8 +5096,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5131,32 +5131,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -5172,8 +5172,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_umax_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -5205,8 +5205,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN2-LABEL: atomic_umax_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -5238,32 +5238,32 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; ; GCN3-LABEL: atomic_umax_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
+; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s0, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6760,7 +6760,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s5, s3, 31 ; GCN1-NEXT: s_mov_b32 s4, s3 @@ -6790,7 +6790,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s5, s3, 31 ; GCN2-NEXT: s_mov_b32 s4, s3 @@ -6820,7 +6820,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_ashr_i32 s5, s3, 31 ; GCN3-NEXT: s_mov_b32 s4, s3 @@ -6855,8 +6855,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 @@ -6890,8 +6890,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -6925,32 +6925,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 @@ -6966,8 +6966,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-LABEL: atomic_min_i32: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN1-NEXT: s_load_dword s2, s[2:3], 0xb +; GCN1-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 +; GCN1-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 @@ -6990,8 +6990,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN2-LABEL: atomic_min_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GCN2-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 @@ -7014,17 +7014,17 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; ; GCN3-LABEL: atomic_min_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 -; GCN3-NEXT: s_load_dword s4, s[2:3], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN3-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s4, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -7043,8 +7043,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) { ; GCN1-LABEL: atomic_min_i32_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_ashr_i32 s7, s5, 31 ; GCN1-NEXT: s_mov_b32 s6, s5 
@@ -7076,8 +7076,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN2-LABEL: atomic_min_i32_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_ashr_i32 s7, s5, 31 ; GCN2-NEXT: s_mov_b32 s6, s5 @@ -7109,32 +7109,32 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; ; GCN3-LABEL: atomic_min_i32_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s3, s1, 31 -; GCN3-NEXT: s_mov_b32 s2, s1 -; GCN3-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GCN3-NEXT: s_add_u32 s2, s4, s2 -; GCN3-NEXT: s_addc_u32 s3, s5, s3 -; GCN3-NEXT: v_mov_b32_e32 v0, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s3 +; GCN3-NEXT: s_ashr_i32 s1, s3, 31 +; GCN3-NEXT: s_mov_b32 s0, s3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] -; GCN3-NEXT: s_mov_b64 s[2:3], 0 +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_min_i32_e32 v2, s0, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3] +; 
GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 ; GCN3-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 86e6224d2f8d56..0de1be1dbb3511 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -21,7 +21,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -36,7 +36,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -54,8 +54,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -73,8 +73,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -93,8 +93,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -115,8 +115,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -134,8 +134,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_add_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -154,8 +154,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -177,7 +177,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -198,7 +198,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -219,7 +219,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -244,7 +244,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; 
GCN1-LABEL: atomic_add_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -257,7 +257,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -270,7 +270,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -287,8 +287,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -304,8 +304,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -322,8 +322,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr 
%out2, i64 %in) { ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -343,8 +343,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -360,8 +360,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_add_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -378,8 +378,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -400,7 +400,7 @@ entry: define amdgpu_kernel void 
@atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -419,7 +419,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -438,7 +438,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -462,7 +462,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -477,7 +477,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -492,7 +492,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -510,8 +510,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -529,8 +529,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -549,8 +549,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -571,8 +571,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -590,8 +590,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_and_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -610,8 +610,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -633,7 +633,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -654,7 +654,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s5 @@ -675,7 +675,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -700,7 +700,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -713,7 +713,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -726,7 +726,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -743,8 +743,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -760,8 +760,8 @@ 
define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -778,8 +778,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -799,8 +799,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -816,8 +816,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_and_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -834,8 +834,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr 
%out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -856,7 +856,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -875,7 +875,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -894,7 +894,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -918,7 +918,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -933,7 +933,7 @@ define 
amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -948,7 +948,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -966,8 +966,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -985,8 +985,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1005,8 +1005,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1027,8 +1027,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1046,8 +1046,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_sub_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1066,8 +1066,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1089,7 +1089,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1110,7 +1110,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1156,7 +1156,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v3, s3 @@ -1199,8 +1199,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1216,8 +1216,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1255,8 +1255,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1272,8 +1272,8 @@ define 
amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_sub_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1290,8 +1290,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1312,7 +1312,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 
0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1374,7 +1374,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1388,7 +1388,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1402,7 +1402,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1420,8 +1420,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1439,8 +1439,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1459,8 +1459,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1481,8 +1481,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1499,8 +1499,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1518,8 +1518,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1541,7 +1541,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1608,7 +1608,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -1620,7 +1620,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1649,8 +1649,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1666,8 +1666,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1684,8 +1684,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s0 :: v_dual_mov_b32 v3, s1 @@ -1705,8 +1705,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1721,8 +1721,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1738,8 +1738,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1760,7 +1760,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i 
; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -1822,7 +1822,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -1850,7 +1850,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -1868,8 +1868,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -1887,8 +1887,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -1907,8 +1907,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1947,8 +1947,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1966,8 +1966,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -1989,7 +1989,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2010,7 +2010,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2031,7 +2031,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, 
s4 :: v_dual_mov_b32 v1, s5 @@ -2056,7 +2056,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2068,7 +2068,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2080,7 +2080,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2097,8 +2097,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2114,8 +2114,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: 
v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2132,8 +2132,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2153,8 +2153,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2169,8 +2169,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umax_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2186,8 +2186,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], 
s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2208,7 +2208,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2246,7 +2246,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2270,7 +2270,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2284,7 +2284,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2298,7 +2298,7 @@ define 
amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2316,8 +2316,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2335,8 +2335,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2355,8 +2355,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2377,8 +2377,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; 
GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2395,8 +2395,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2414,8 +2414,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2437,7 +2437,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2458,7 +2458,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2479,7 +2479,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2504,7 +2504,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2516,7 +2516,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2528,7 +2528,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2545,8 +2545,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2562,8 +2562,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2580,8 +2580,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2601,8 +2601,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2617,8 +2617,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_min_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2634,8 +2634,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2656,7 +2656,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2675,7 +2675,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2694,7 +2694,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2718,7 +2718,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -2732,7 +2732,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2764,8 +2764,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -2783,8 +2783,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -2803,8 +2803,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; 
GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -2825,8 +2825,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2843,8 +2843,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umin_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2862,8 +2862,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2885,7 +2885,7 @@ entry: define amdgpu_kernel 
void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -2906,7 +2906,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -2927,7 +2927,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -2952,7 +2952,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -2964,7 +2964,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -2976,7 +2976,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry 
-; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -2993,8 +2993,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3010,8 +3010,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3028,8 +3028,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3049,8 +3049,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; 
GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3065,8 +3065,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_umin_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3082,8 +3082,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3104,7 +3104,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3123,7 +3123,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3142,7 +3142,7 @@ 
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3166,7 +3166,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3181,7 +3181,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3214,8 +3214,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3233,8 
+3233,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3253,8 +3253,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3275,8 +3275,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3294,8 +3294,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; ; GCN2-LABEL: atomic_or_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3314,8 +3314,8 
@@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3337,7 +3337,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3358,7 +3358,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3379,7 +3379,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3404,7 +3404,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3417,7 +3417,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3430,7 +3430,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3447,8 +3447,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3464,8 +3464,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3482,8 +3482,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3503,8 +3503,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3520,8 +3520,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; ; GCN2-LABEL: atomic_or_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3538,8 +3538,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3560,7 +3560,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3579,7 +3579,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GCN2-LABEL: atomic_or_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3622,7 +3622,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3637,7 +3637,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3652,7 +3652,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v3, s3 @@ -3670,7 +3670,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3685,7 +3685,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3700,7 +3700,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3718,7 +3718,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -3733,7 +3733,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -3748,7 +3748,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, 
ptr %in) { ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3766,8 +3766,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -3785,8 +3785,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -3805,8 +3805,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -3827,8 +3827,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64_offset: ; 
GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3846,8 +3846,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_xchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3866,8 +3866,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3889,7 +3889,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -3910,7 +3910,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -3931,7 +3931,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -3956,7 +3956,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -3969,7 +3969,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -3982,7 +3982,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -3999,8 +3999,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4016,8 +4016,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4034,8 +4034,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4055,8 +4055,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4072,8 +4072,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_xchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) 
; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4090,8 +4090,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4112,7 +4112,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4131,7 +4131,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4150,7 +4150,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4174,7 +4174,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4189,7 +4189,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4204,7 +4204,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4222,8 +4222,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -4241,8 +4241,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -4261,8 +4261,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: 
atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4283,8 +4283,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4302,8 +4302,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_xor_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4322,8 +4322,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4345,7 +4345,7 @@ entry: define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4366,7 +4366,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4387,7 +4387,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4412,7 +4412,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4425,7 +4425,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4438,7 +4438,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4455,8 +4455,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4472,8 +4472,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4490,8 +4490,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -4511,8 +4511,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4528,8 +4528,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_xor_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4546,8 +4546,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4568,7 +4568,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -4587,7 +4587,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -4606,7 +4606,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -4630,7 +4630,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -4646,7 +4646,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -4662,7 +4662,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4681,7 +4681,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4695,7 +4695,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_i64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4709,7 +4709,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4727,8 +4727,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4747,8 +4747,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4768,8 +4768,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4792,8 +4792,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4810,8 +4810,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4829,8 +4829,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4852,7 +4852,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -4865,7 +4865,7 @@ define amdgpu_kernel void 
@atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4894,7 +4894,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; GCN1-LABEL: atomic_store_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -4905,7 +4905,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -4916,7 +4916,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -4931,8 +4931,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64_offset: ; GCN1: 
; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4948,8 +4948,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_store_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4966,8 +4966,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -4987,8 +4987,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5002,8 +5002,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; ; GCN2-LABEL: atomic_store_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5018,8 +5018,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5038,8 +5038,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 32 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -5056,8 +5056,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; ; GCN2-LABEL: atomic_cmpxchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 32 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -5075,8 +5075,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5095,8 +5095,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_soffset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 @@ -5113,8 +5113,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; ; GCN2-LABEL: atomic_cmpxchg_i64_soffset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 @@ -5132,8 +5132,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5152,7 +5152,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: 
atomic_cmpxchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5172,7 +5172,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5192,7 +5192,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5215,7 +5215,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5235,7 +5235,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5255,7 +5255,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; ; GFX12-LABEL: 
atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5277,8 +5277,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 @@ -5301,8 +5301,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 @@ -5326,8 +5326,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5353,8 +5353,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: 
atomic_cmpxchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 @@ -5369,8 +5369,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 @@ -5386,8 +5386,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5405,7 +5405,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 @@ -5423,7 +5423,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 @@ -5441,7 +5441,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -5463,7 +5463,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5481,7 +5481,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5499,7 +5499,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5520,8 +5520,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 @@ -5542,8 +5542,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 @@ -5565,8 +5565,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 @@ -5591,7 +5591,7 @@ entry: define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -5607,7 +5607,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -5623,7 +5623,7 
@@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5642,7 +5642,7 @@ entry: define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GCN1-LABEL: atomic_load_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5656,7 +5656,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GCN2-LABEL: atomic_load_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5670,7 +5670,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; ; GFX12-LABEL: atomic_load_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5688,8 +5688,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5708,8 +5708,8 
@@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; ; GCN2-LABEL: atomic_load_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5729,8 +5729,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5753,8 +5753,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_load_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5771,8 +5771,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; ; GCN2-LABEL: atomic_load_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5790,8 +5790,8 @@ define amdgpu_kernel 
void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-LABEL: atomic_load_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5813,7 +5813,7 @@ entry: define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: s_add_u32 s0, s2, 32 @@ -5826,7 +5826,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: s_add_u32 s0, s2, 32 @@ -5839,7 +5839,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5855,7 +5855,7 @@ entry: define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GCN1-LABEL: atomic_store_f64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -5866,7 
+5866,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GCN2-LABEL: atomic_store_f64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -5877,7 +5877,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; ; GFX12-LABEL: atomic_store_f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -5892,8 +5892,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5909,8 +5909,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; ; GCN2-LABEL: atomic_store_f64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5927,8 +5927,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5948,8 +5948,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { ; GCN1-LABEL: atomic_store_f64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5963,8 +5963,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; ; GCN2-LABEL: atomic_store_f64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5979,8 +5979,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-LABEL: atomic_store_f64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -5999,7 +5999,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6014,7 +6014,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6047,8 +6047,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -6066,8 +6066,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -6086,8 +6086,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6108,8 +6108,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6127,8 +6127,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_inc_i64_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6147,8 +6147,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6170,7 +6170,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 
%in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6191,7 +6191,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6212,7 +6212,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6237,7 +6237,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6250,7 +6250,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6263,7 +6263,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6280,8 +6280,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6297,8 +6297,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6315,8 +6315,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6336,8 +6336,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6353,8 +6353,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_inc_i64_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6371,8 +6371,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6393,7 +6393,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6412,7 +6412,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6431,7 +6431,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; ; 
GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6455,7 +6455,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 @@ -6470,7 +6470,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 @@ -6485,7 +6485,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6503,8 +6503,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: s_add_u32 s0, s0, 32 @@ -6522,8 +6522,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr 
%out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: s_add_u32 s0, s0, 32 @@ -6542,8 +6542,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6564,8 +6564,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6583,8 +6583,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6603,8 +6603,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 
%in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6626,7 +6626,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6647,7 +6647,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6668,7 +6668,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -6693,7 +6693,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 @@ -6706,7 
+6706,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 @@ -6719,7 +6719,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -6736,8 +6736,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6753,8 +6753,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6771,8 +6771,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-LABEL: atomic_dec_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -6792,8 +6792,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6809,8 +6809,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_dec_i64_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -6827,8 +6827,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6849,7 +6849,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 @@ -6868,7 +6868,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 @@ -6887,7 +6887,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 7e4a36b7dc11b4..d812b4b7d86e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -4258,8 +4258,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4292,8 +4292,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4326,10 +4326,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_max_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4365,7 +4365,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4402,7 +4402,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4439,7 +4439,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -4482,8 +4482,8 @@ 
entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -4514,8 +4514,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -4546,10 +4546,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; ; GCN3-LABEL: atomic_max_i64_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -4584,7 +4584,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void 
@atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -4654,7 +4654,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_max_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5640,8 +5640,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -5674,8 +5674,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -5708,10 +5708,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; ; GCN3-LABEL: atomic_umax_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 
-; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -5747,7 +5747,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5784,7 +5784,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5821,7 +5821,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -5864,7 +5864,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -5899,7 +5899,7 @@ 
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -5934,7 +5934,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; ; GCN3-LABEL: atomic_umax_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -7864,8 +7864,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s4 @@ -7898,8 +7898,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s4 @@ -7932,10 +7932,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; ; GCN3-LABEL: atomic_min_i64_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GCN3-NEXT: s_add_u32 s0, s4, s0 ; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v5, s1 @@ -7971,7 +7971,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8008,7 +8008,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8045,7 +8045,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 @@ -8088,7 +8088,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 @@ -8118,7 +8118,7 @@ define amdgpu_kernel void 
@atomic_min_i64(ptr %out, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -8148,7 +8148,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; ; GCN3-LABEL: atomic_min_i64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8183,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN1-NEXT: s_add_u32 s0, s0, s6 @@ -8218,7 +8218,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN2-NEXT: s_add_u32 s0, s0, s6 @@ -8253,7 +8253,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; ; GCN3-LABEL: atomic_min_i64_ret_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GCN3-NEXT: s_add_u32 s0, s0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 4846e21fe836eb..bac2d8b8b40c26 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -21,7 +21,7 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -41,9 +41,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -77,7 +75,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -103,9 +101,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -150,7 +146,7 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr 
addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_f64_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -170,9 +166,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -206,7 +200,7 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -226,9 +220,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -262,7 +254,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noalias %out, ptr 
addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_f64_0_2use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -288,9 +280,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -335,7 +325,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -355,9 +345,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -391,7 +379,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_1_f64_2use: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -417,9 +405,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -464,7 +450,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -484,9 +470,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc @@ -522,7 +506,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -548,9 +532,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -597,7 +579,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -623,9 +605,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc @@ -672,7 +652,7 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -698,7 +678,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -723,9 +703,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -749,9 +727,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -798,7 +774,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { ; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s6, 0 ; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -824,7 +800,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s6, 0 ; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -849,9 +825,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -875,9 +849,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc @@ -927,56 +899,56 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; 
SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: 
s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -994,8 +966,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1020,56 +992,56 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: 
s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, 
off, s[12:15], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1087,8 +1059,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1113,55 +1085,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: 
buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1180,8 +1152,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1206,55 +1178,55 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, 
off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1273,8 +1245,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1299,55 +1271,55 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; 
SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, 
v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1366,8 +1338,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1392,55 +1364,55 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: 
s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; 
GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1459,8 +1431,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1485,55 +1457,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 
-; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 
0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1552,8 +1524,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1578,55 +1550,55 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; 
SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1645,8 +1617,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1671,55 +1643,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; 
SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1738,8 +1710,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1764,55 +1736,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, 
v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1831,8 +1803,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: 
test_f32_mul_y_sub_x_one: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1857,55 +1829,55 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: 
s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -1924,8 +1896,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1950,55 +1922,55 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s10, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s10 +; SI-NOFMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NOFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-NOFMA-NEXT: s_mov_b32 s2, -1 +; SI-NOFMA-NEXT: s_mov_b32 s14, s2 ; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s12, s6 ; SI-NOFMA-NEXT: s_mov_b32 s13, s7 -; SI-NOFMA-NEXT: s_mov_b32 s15, s11 -; SI-NOFMA-NEXT: s_mov_b32 s2, s10 -; SI-NOFMA-NEXT: s_mov_b32 s3, s11 +; SI-NOFMA-NEXT: s_mov_b32 s15, s3 +; SI-NOFMA-NEXT: s_mov_b32 s10, s2 +; SI-NOFMA-NEXT: s_mov_b32 s11, s3 ; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NOFMA-NEXT: s_mov_b32 s8, s4 -; SI-NOFMA-NEXT: s_mov_b32 s9, s5 +; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NOFMA-NEXT: s_mov_b32 s0, s4 +; SI-NOFMA-NEXT: s_mov_b32 s1, s5 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) ; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NOFMA-NEXT: s_endpgm ; ; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
SI-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-FMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-FMA-NEXT: s_mov_b32 s2, -1 +; SI-FMA-NEXT: s_mov_b32 s14, s2 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s12, s6 ; SI-FMA-NEXT: s_mov_b32 s13, s7 -; SI-FMA-NEXT: s_mov_b32 s15, s11 -; SI-FMA-NEXT: s_mov_b32 s2, s10 -; SI-FMA-NEXT: s_mov_b32 s3, s11 +; SI-FMA-NEXT: s_mov_b32 s15, s3 +; SI-FMA-NEXT: s_mov_b32 s10, s2 +; SI-FMA-NEXT: s_mov_b32 s11, s3 ; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-FMA-NEXT: s_mov_b32 s8, s4 -; SI-FMA-NEXT: s_mov_b32 s9, s5 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-FMA-NEXT: s_mov_b32 s0, s4 +; SI-FMA-NEXT: s_mov_b32 s1, s5 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) ; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-FMA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-FMA-NEXT: s_endpgm ; ; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-NOFMA: ; %bb.0: ; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x1 @@ -2017,8 +1989,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x34 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -2047,7 +2019,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-NOFMA-LABEL: test_f32_interp: ; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NOFMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NOFMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-NOFMA-NEXT: s_mov_b32 s10, -1 ; SI-NOFMA-NEXT: s_mov_b32 s14, s10 @@ -2079,7 +2051,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; SI-FMA-LABEL: test_f32_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2109,7 +2081,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f32_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2130,7 +2102,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f32_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2163,7 +2135,7 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; 
SI-FMA-NEXT: s_mov_b32 s10, -1 ; SI-FMA-NEXT: s_mov_b32 s18, s10 @@ -2193,7 +2165,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-NOFMA-LABEL: test_f64_interp: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NOFMA-NEXT: s_clause 0x2 @@ -2214,7 +2186,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; ; GFX11-FMA-LABEL: test_f64_interp: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x2 @@ -2248,7 +2220,7 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_neg_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2264,9 +2236,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2296,7 +2266,7 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: fma_2.0_neg_a_b_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2312,9 +2282,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: fma_2.0_neg_a_b_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -2344,7 +2312,7 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 { ; SI-LABEL: fma_neg_b_c_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v12, 4, v0 @@ -2365,9 +2333,7 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: fma_neg_b_c_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 39a9a85081af59..93ed64d93b8ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -159,15 +159,15 @@ define float @fold_fmul_distributive(float %x, float %y) { define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) { ; GFX906-LABEL: vec_mul_scalar_add_fma: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dword s8, 
s[2:3], 0x34 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v1, s8 ; GFX906-NEXT: v_mov_b32_e32 v2, s6 ; GFX906-NEXT: v_fmac_f32_e32 v1, s4, v2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX906-NEXT: global_store_dword v0, v1, s[2:3] offset:4 ; GFX906-NEXT: s_endpgm %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1 %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 84852c2632f671..23eb73038917d2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; 
GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr 
addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmax3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void 
@test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmax3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmax3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmax3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index 018399983a863d..01b2f207388e8a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -59,7 +59,7 @@ define amdgpu_kernel 
void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -111,7 +111,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -132,7 +132,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -163,7 +163,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -184,7 
+184,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmax_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 3b7009023b03af..87ac95a1cd7390 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -262,8 +262,8 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fmaximum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 84099e472d65fd..764fb992d4d34c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -45,7 +45,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -63,7 +63,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -83,7 +83,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,14 +95,13 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -123,7 +122,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -140,7 +139,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -157,7 +156,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -175,7 +174,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -195,7 +194,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -207,14 +206,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -236,7 +234,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -253,7 +251,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; SI-GISEL: ; %bb.0: -; 
SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -270,7 +268,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -288,7 +286,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -308,7 +306,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -320,14 +318,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -349,7 +346,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -366,7 +363,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -383,7 +380,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -401,7 +398,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -421,7 +418,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr 
addrspace(1) ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -433,14 +430,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -462,7 +458,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -480,7 +476,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -498,7 
+494,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -517,7 +513,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -551,14 +547,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; ; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -580,7 +575,7 @@ 
define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -601,7 +596,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -622,7 +617,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -644,7 +639,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -668,7 +663,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -684,7 +679,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -700,16 +695,14 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -721,14 +714,13 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0 ; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 ; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc @@ -755,7 +747,7 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -773,7 +765,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -791,7 +783,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -810,7 +802,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, 
v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -831,7 +823,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -844,16 +836,14 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_fmed3_r_i_i_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -875,7 +865,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -891,7 +881,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: 
; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -907,7 +897,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -924,7 +914,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -943,7 +933,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -954,9 +944,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -981,7 +969,7 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) 
%out, define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -998,7 +986,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1016,7 +1004,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1034,7 +1022,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1057,7 +1045,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword 
v1, v0, s[2:3] @@ -1069,7 +1057,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1084,14 +1072,13 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1100,17 +1087,15 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 2.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1 ; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1138,7 +1123,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1162,7 +1147,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1185,7 +1170,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1212,7 +1197,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1244,7 +1229,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1259,7 +1244,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1275,9 +1260,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1294,9 +1277,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; 
GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1332,7 +1313,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1356,7 +1337,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1379,7 +1360,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1406,7 +1387,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1438,7 +1419,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1453,7 +1434,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1469,9 +1450,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1488,9 +1467,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1526,7 +1503,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1550,7 +1527,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1573,7 +1550,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1600,7 +1577,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1632,7 +1609,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1647,7 +1624,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1663,9 +1640,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1682,9 +1657,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1720,7 +1693,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr 
addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1744,7 +1717,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1768,7 +1741,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1795,7 +1768,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1828,7 +1801,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, 
s[2:3] glc @@ -1843,7 +1816,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -1860,9 +1833,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1879,9 +1850,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -1924,7 +1893,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1948,7 +1917,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -1973,7 +1942,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2000,7 +1969,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2034,7 +2003,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2049,7 +2018,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX9-GISEL: 
; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2067,9 +2036,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2086,9 +2053,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2134,7 +2099,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2161,7 +2126,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) 
%out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2186,7 +2151,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2216,7 +2181,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2250,7 +2215,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2268,9 +2233,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2311,7 +2274,7 @@ define 
amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2335,7 +2298,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2357,7 +2320,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2384,7 +2347,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2415,7 +2378,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 
0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2430,9 +2393,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2465,7 +2426,7 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2489,7 +2450,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2511,7 +2472,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2538,7 +2499,7 @@ define amdgpu_kernel void 
@v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2569,7 +2530,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2584,9 +2545,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_nnan_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2619,7 +2578,7 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2643,7 +2602,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2665,7 +2624,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2692,7 +2651,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2723,7 +2682,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2738,9 +2697,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_fast_call_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2785,7 +2742,7 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2809,7 +2766,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2831,7 +2788,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2858,7 +2815,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2889,7 +2846,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -2904,9 +2861,7 @@ define 
amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -2939,7 +2894,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2963,7 +2918,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -2985,7 +2940,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3012,7 +2967,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3043,7 +2998,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3058,9 +3013,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3093,7 +3046,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3117,7 +3070,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3140,7 +3093,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3167,7 +3120,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3199,7 +3152,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3214,7 +3167,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3230,9 +3183,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-SDAG: ; %bb.0: 
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3249,9 +3200,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3287,7 +3236,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3311,7 +3260,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3333,7 +3282,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: 
v_test_global_nnans_med3_f32_pat2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3360,7 +3309,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3391,7 +3340,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3406,9 +3355,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3441,7 +3388,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3465,7 +3412,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3487,7 +3434,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3514,7 +3461,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3545,7 +3492,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3560,9 +3507,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3595,7 +3540,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3619,7 +3564,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3641,7 +3586,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3668,7 +3613,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3699,7 
+3644,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3714,9 +3659,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3749,7 +3692,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3773,7 +3716,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3795,7 +3738,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-SDAG: ; 
%bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3822,7 +3765,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3853,7 +3796,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -3868,9 +3811,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -3903,7 +3844,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 
0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3927,7 +3868,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -3949,7 +3890,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3976,7 +3917,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4007,7 +3948,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4022,9 +3963,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4057,7 +3996,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4081,7 +4020,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4103,7 +4042,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4130,7 +4069,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4161,7 +4100,7 @@ define 
amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4176,9 +4115,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4211,7 +4148,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4235,7 +4172,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4257,7 +4194,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4284,7 +4221,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4315,7 +4252,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4330,9 +4267,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4365,7 +4300,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; 
SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4389,7 +4324,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4411,7 +4346,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4438,7 +4373,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4469,7 +4404,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4484,9 +4419,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4519,7 +4452,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4543,7 +4476,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4565,7 +4498,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4592,7 +4525,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4623,7 +4556,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4638,9 +4571,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4673,7 +4604,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4697,7 +4628,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4719,7 +4650,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4746,7 +4677,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4777,7 +4708,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4792,9 +4723,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4827,7 +4756,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: 
s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4851,7 +4780,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -4873,7 +4802,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4900,7 +4829,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4931,7 +4860,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -4946,9 +4875,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -4981,7 +4908,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5005,7 +4932,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5027,7 +4954,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5054,7 +4981,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5085,7 +5012,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5100,9 +5027,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5135,7 +5060,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5159,7 +5084,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5181,7 +5106,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5208,7 +5133,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5239,7 +5164,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5254,9 +5179,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5289,7 +5212,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: 
s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5313,7 +5236,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5335,7 +5258,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5362,7 +5285,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5393,7 +5316,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5408,9 +5331,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5446,7 +5367,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5470,7 +5391,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5492,7 +5413,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5519,7 +5440,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5550,7 +5471,7 @@ define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5565,9 +5486,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5604,7 +5523,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5637,7 +5556,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5669,7 +5588,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5704,7 +5623,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5743,7 +5662,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5766,9 +5685,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5808,7 +5725,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 
0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -5841,7 +5758,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -5873,7 +5790,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -5908,7 +5825,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -5947,7 +5864,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -5970,9 +5887,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -5981,10 +5896,9 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 ; GFX11-SDAG-NEXT: v_min_f32_e32 v3, v4, v3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 @@ -5997,9 +5911,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6039,7 +5951,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6072,7 +5984,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6104,7 +6016,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6139,7 +6051,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6178,7 +6090,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6201,9 +6113,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; ; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6243,7 +6153,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6272,7 +6182,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6300,7 +6210,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6333,7 +6243,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6370,7 
+6280,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6391,9 +6301,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6414,9 +6322,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; ; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6453,7 +6359,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-6480,7 +6386,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6505,7 +6411,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6535,7 +6441,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6569,7 +6475,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6587,9 +6493,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6630,7 +6534,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6657,7 +6561,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6682,7 +6586,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6712,7 +6616,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6746,7 +6650,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) 
; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6764,9 +6668,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6807,7 +6709,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -6834,7 +6736,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -6859,7 +6761,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -6889,7 +6791,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -6923,7 +6825,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -6941,9 +6843,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; ; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -6984,7 +6884,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7008,7 +6908,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7031,7 +6931,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7058,7 +6958,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7090,7 +6990,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7105,7 +7005,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7121,9 +7021,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7140,9 +7038,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa ; ; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7178,7 +7074,7 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7204,7 +7100,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; SI-GISEL-LABEL: 
v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7230,7 +7126,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7260,7 +7156,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7295,7 +7191,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7313,7 +7209,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7332,9 +7228,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7353,9 +7247,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; ; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7394,7 +7286,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7419,7 +7311,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 
; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7442,7 +7334,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7470,7 +7362,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7502,7 +7394,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -7518,9 +7410,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_global_nnans_min_max_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -7551,7 +7441,7 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7570,7 +7460,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7597,7 +7487,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7616,7 +7506,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7637,7 +7527,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -7649,14 +7539,13 @@ define amdgpu_kernel void 
@v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; ; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -7677,7 +7566,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { ; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s10, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -7708,7 +7597,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 @@ -7755,7 +7644,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) 
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7788,7 +7677,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7825,7 +7714,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -7843,9 +7732,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -7887,7 +7774,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -7905,7 +7792,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: two_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -7923,7 +7810,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: two_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -7942,7 +7829,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: two_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -7963,7 +7850,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -7976,9 +7863,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-SDAG-LABEL: two_non_inline_constant: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -7994,15 +7879,14 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; 
GFX11-GISEL-LABEL: two_non_inline_constant: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -8024,7 +7908,7 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: one_non_inline_constant: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8046,7 +7930,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: one_non_inline_constant: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8068,7 +7952,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: one_non_inline_constant: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8090,7 +7974,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: one_non_inline_constant: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -8114,7 +7998,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: one_non_inline_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8130,9 +8014,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: one_non_inline_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8165,7 +8047,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-SDAG-LABEL: two_non_inline_constant_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -8191,7 +8073,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; SI-GISEL-LABEL: 
two_non_inline_constant_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -8217,7 +8099,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-SDAG-LABEL: two_non_inline_constant_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8243,7 +8125,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; VI-GISEL-LABEL: two_non_inline_constant_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000 @@ -8271,7 +8153,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -8291,7 +8173,7 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 @@ -8311,9 +8193,7 @@ define 
amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -8335,15 +8215,13 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; ; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2 ; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1 ; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 3a55b2d50a5e54..7337d90b4bea63 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -37,7 +37,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -97,7 +97,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -169,7 +169,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -199,7 +199,7 
@@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -229,7 +229,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -270,7 +270,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -368,7 +368,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: 
test_fmin3_olt_0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -410,7 +410,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -444,7 +444,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -478,7 +478,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -508,7 +508,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -612,7 +612,7 @@ entry: define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_0_f64: ; SI: ; 
%bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -646,7 +646,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -680,7 +680,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_0_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -759,7 +759,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { ; SI-LABEL: test_fmin3_olt_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -793,7 +793,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_fmin3_olt_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 
s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -827,7 +827,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_fmin3_olt_1_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -861,7 +861,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_fmin3_olt_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index 85653ded63ce6f..d20c39d5103649 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -26,7 +26,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_uge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ugt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ule_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -182,7 +182,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ult_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +213,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -234,7 +234,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_oge_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -286,7 +286,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ogt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,7 +317,7 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -338,7 +338,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_ole_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -369,7 +369,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: s_mov_b32 s11, s7 @@ -390,7 +390,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: test_fmin_legacy_olt_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 817e6dd87361ff..45f6bff10f45ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -262,8 +262,8 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimumi_f32_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_b32 v1, v0, s[6:7] 
scope:SCOPE_SYS @@ -286,8 +286,8 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr ; GCN-LABEL: fminimum_f16_move_to_valu: ; GCN: ; %bb.0: ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index c60b9858abd836..7830c91851bfa7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -15,7 +15,7 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_fadd_use_test_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -46,7 +46,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 @@ -79,20 +79,20 @@ define amdgpu_kernel void 
@multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmac_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-NEXT: s_load_dword s3, s[6:7], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-NEXT: s_load_dword s3, s[4:5], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s2, s0, 4 -; VI-NEXT: v_add_f32_e64 v2, s4, s4 +; VI-NEXT: v_add_f32_e64 v2, s6, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 +; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -101,9 +101,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX10-LABEL: multiple_use_fadd_fmac_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2 @@ -117,13 +117,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; GFX11-LABEL: multiple_use_fadd_fmac_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s3, 
s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, s4, s4 -; GFX11-NEXT: v_fma_f32 v2, s4, 2.0, s5 +; GFX11-NEXT: v_add_f32_e64 v1, s2, s2 +; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc @@ -142,7 +142,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -161,7 +161,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX10-LABEL: multiple_use_fadd_fmad_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| @@ -198,21 +198,21 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 { ; VI-LABEL: multiple_use_fadd_multi_fmad_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s6, s4, 4 +; VI-NEXT: s_add_u32 s4, s6, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_addc_u32 s7, s5, 0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_addc_u32 s5, s7, 0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm @@ -220,23 +220,23 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[4:5] offset:4 +; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5 @@ -261,8 +261,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -275,8 +275,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn2_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0 @@ -288,12 +288,12 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -310,8 +310,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void 
@fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; VI-LABEL: fmul_x2_xn3_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 @@ -325,8 +325,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX10-LABEL: fmul_x2_xn3_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 @@ -338,12 +338,12 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; GFX11-LABEL: fmul_x2_xn3_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s4 +; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -360,8 +360,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_fadd_use_test_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s2, 
s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -378,8 +378,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -396,13 +396,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -414,12 +414,12 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dword s0, s[6:7], 0x8 +; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 ; GFX10-FLUSH-NEXT: 
v_add_f16_e64 v1, s1, -1.0 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0| @@ -433,13 +433,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-DENORM-NEXT: s_clause 0x1 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -447,7 +448,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -455,12 +455,13 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b32 s0, s[2:3], 0x8 +; 
GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0 -; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0| ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -471,7 +472,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -496,14 +496,14 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, 
s6 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -517,12 +517,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 @@ -530,7 +530,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -539,8 +539,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -555,8 +555,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX10-FLUSH: ; %bb.0: 
; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 @@ -571,13 +571,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s4, s4 -; GFX11-DENORM-NEXT: v_fma_f16 v2, s4, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2 +; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -589,12 +589,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s4, s4 -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; 
GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -617,14 +617,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 +; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 @@ -638,14 +638,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 -; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 +; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6| ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 
v1, s1 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 @@ -660,8 +660,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16 @@ -676,8 +676,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| @@ -692,13 +692,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s4|, |s4| -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2| +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; 
GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc @@ -710,12 +710,12 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2| +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc @@ -739,9 +739,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 -; VI-DENORM-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -762,9 +762,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 
-; VI-FLUSH-NEXT: s_load_dword s6, s[6:7], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -786,14 +786,14 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x2 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 +; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1 +; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2 @@ -803,12 +803,12 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x2 -; GFX10-FLUSH-NEXT: s_load_dword s4, s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: 
v_add_f16_e64 v0, |s4|, |s4| +; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6| ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 @@ -821,17 +821,17 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x2 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s1 -; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3] dlc +; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3 +; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2 +; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3] offset:2 dlc +; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-DENORM-NEXT: s_nop 0 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -840,19 +840,19 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x2 -; GFX11-FLUSH-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x8 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4| -; GFX11-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0 -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[2:3] dlc +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0 +; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[2:3] offset:2 dlc +; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,8 +873,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -887,8 +887,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0 @@ -900,13 +900,13 @@ 
define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, s4, -4.0 +; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 @@ -925,8 +925,8 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { ; VI-LABEL: fmul_x2_xn3_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 @@ -940,8 +940,8 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX10-LABEL: fmul_x2_xn3_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2 @@ -953,13 +953,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; GFX11-LABEL: fmul_x2_xn3_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s4 +; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mul_f16_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 7c1c970b3fef78..98faaacf1dfb0a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fmul_f16( ; SI-LABEL: fmul_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; 
SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fmul_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_a( ; SI-LABEL: fmul_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX89-LABEL: fmul_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; ; GFX11-LABEL: fmul_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fmul_f16_imm_b( ; SI-LABEL: fmul_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX89-LABEL: fmul_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; ; GFX11-LABEL: fmul_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fmul_v2f16( ; SI-LABEL: fmul_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fmul_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fmul_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; 
VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel 
void @fmul_v2f16_imm_a( ; SI-LABEL: fmul_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; VI-LABEL: fmul_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX9-LABEL: fmul_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_a( ; ; GFX11-LABEL: fmul_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fmul_v2f16_imm_b( ; SI-LABEL: fmul_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; VI-LABEL: fmul_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX9-LABEL: fmul_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fmul_v2f16_imm_b( ; ; GFX11-LABEL: fmul_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -533,21 +533,21 @@ entry: define amdgpu_kernel void @fmul_v4f16( ; SI-LABEL: fmul_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -574,26 +574,26 @@ define amdgpu_kernel void @fmul_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; 
VI-LABEL: fmul_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1 @@ -601,37 +601,37 @@ define amdgpu_kernel void @fmul_v4f16( ; VI-NEXT: v_mul_f16_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fmul_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fmul_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -666,7 +666,7 @@ entry: define amdgpu_kernel void @fmul_v4f16_imm_a( ; SI-LABEL: fmul_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -701,7 +701,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; VI-LABEL: fmul_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -725,7 +725,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX9-LABEL: fmul_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -746,7 +746,7 @@ define amdgpu_kernel void @fmul_v4f16_imm_a( ; ; GFX11-LABEL: fmul_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 9300dfcb16e8ad..718be90eb75fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -22,7 +22,7 @@ declare half @llvm.fabs.f16(half) #1 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -62,7 +62,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -92,7 +92,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -111,7 +111,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -136,7 +136,7 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -156,7 +156,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; 
GFX10-FLUSH-LABEL: fmul_fadd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -192,7 +192,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2 @@ -208,7 +208,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -222,7 +222,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-FLUSH-LABEL: fmul_fadd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-DENORM-STRICT-NEXT: s_clause 0x2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2 @@ -286,7 +286,7 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; VI-FLUSH-LABEL: fmul_fadd_contract_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -306,7 +306,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; VI-DENORM-LABEL: fmul_fadd_contract_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 @@ -326,7 +326,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_clause 0x2 @@ -342,7 +342,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_clause 0x2 @@ -356,7 +356,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_clause 0x2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add ; ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: s_clause 0x2 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -419,7 +419,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -437,7 +437,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: 
fmuladd_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -451,7 +451,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -464,9 +464,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -483,9 +481,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -513,7 +509,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) 
%in) #0 { ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -531,7 +527,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -549,7 +545,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -563,7 +559,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -576,9 +572,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -595,9 +589,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -625,7 +617,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_a_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -643,7 +635,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -661,7 +653,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -675,7 +667,7 @@ define amdgpu_kernel void 
@fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -689,7 +681,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -702,9 +694,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -721,9 +711,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -740,9 
+728,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -773,7 +759,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; VI-FLUSH-LABEL: fadd_b_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -791,7 +777,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -809,7 +795,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -823,7 +809,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-STRICT-LABEL: 
fadd_b_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -837,7 +823,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -850,9 +836,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -869,9 +853,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, ; ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -888,9 +870,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) 
%out, ; ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -921,7 +901,7 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out, define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -939,7 +919,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -957,7 +937,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -971,7 +951,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; 
GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -984,9 +964,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1003,9 +981,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1033,7 +1009,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1051,7 +1027,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; VI-DENORM-LABEL: 
fmuladd_neg_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1069,7 +1045,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1083,7 +1059,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1096,9 +1072,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1115,9 +1089,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt ; ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1147,7 +1119,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1165,7 +1137,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1183,7 +1155,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1197,7 +1169,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1210,9 +1182,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1229,9 +1199,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1261,7 +1229,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -1279,7 +1247,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 @@ -1297,7 +1265,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1311,7 +1279,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -1324,9 +1292,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -1343,9 +1309,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ 
-1375,7 +1339,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1400,7 +1364,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1425,7 +1389,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1441,7 +1405,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1457,7 +1421,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; 
GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1472,9 +1436,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1493,9 +1455,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1514,9 +1474,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1550,7 +1508,7 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1575,7 +1533,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1600,7 +1558,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1616,7 +1574,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: 
global_load_ushort v1, v0, s[2:3] glc dlc @@ -1632,7 +1590,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1647,9 +1605,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1668,9 +1624,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1689,9 +1643,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1725,7 +1677,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1750,7 +1702,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1775,7 +1727,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1791,7 +1743,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1807,7 +1759,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1822,9 +1774,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1843,9 +1793,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1864,9 +1812,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: 
mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1901,7 +1847,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -1926,7 +1872,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -1951,7 +1897,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1967,7 +1913,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr 
addrspace(1) noalias nocaptu ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1983,7 +1929,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1998,9 +1944,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2019,9 +1963,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2040,9 +1982,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2077,7 +2017,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2102,7 +2042,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2127,7 +2067,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, 
v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2143,7 +2083,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2159,7 +2099,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2174,9 +2114,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2195,9 +2133,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2216,9 +2152,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2254,7 +2188,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 @@ -2279,7 +2213,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 @@ -2304,7 +2238,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; 
%bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2320,7 +2254,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2336,7 +2270,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -2351,9 +2285,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2372,9 +2304,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; 
GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2393,9 +2323,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -2430,7 +2358,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2448,7 +2376,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2466,7 +2394,7 @@ 
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2480,7 +2408,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2494,7 +2422,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2507,9 +2435,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2526,9 +2452,7 @@ define amdgpu_kernel void 
@fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2545,9 +2469,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2577,7 +2499,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 @@ -2595,7 +2517,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 @@ -2613,7 +2535,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2627,7 +2549,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2641,7 +2563,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -2654,9 +2576,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2673,9 +2593,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc @@ -2692,9 +2610,7 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index ba8b6fb80518fc..f411a76e75ab69 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; SI-LABEL: fnearbyint_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -28,24 +28,23 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; ; CI-LABEL: fnearbyint_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_rndne_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fnearbyint_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f16_e32 v2, s4 +; VI-NEXT: v_rndne_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -54,11 +53,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { ; GFX11-LABEL: fnearbyint_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v1, s4 +; GFX11-NEXT: v_rndne_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -71,8 +70,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 { define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; SICI-LABEL: fnearbyint_f32: ; SICI: ; %bb.0: ; %entry -; 
SICI-NEXT: s_load_dword s4, s[2:3], 0xb -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dword s4, s[0:1], 0xb +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -82,10 +81,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; ; VI-LABEL: fnearbyint_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rndne_f32_e32 v2, s4 +; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -94,11 +93,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 { ; GFX11-LABEL: fnearbyint_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s4 +; GFX11-NEXT: v_rndne_f32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,7 +111,7 @@ entry: define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v2f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s7, 0xf000 ; SICI-NEXT: s_mov_b32 s6, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +124,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 @@ -136,7 +135,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 @@ -154,8 +153,8 @@ entry: define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 { ; SICI-LABEL: fnearbyint_v4f32: ; SICI: ; %bb.0: ; %entry -; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SICI-NEXT: s_mov_b32 s3, 0xf000 ; SICI-NEXT: s_mov_b32 s2, -1 ; SICI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,8 +167,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fnearbyint_v4f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -183,8 +182,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; GFX11-LABEL: fnearbyint_v4f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v3, s7 @@ -204,7 +203,7 @@ entry: define amdgpu_kernel void 
@nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-LABEL: nearbyint_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s8, -2 @@ -228,7 +227,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; CI-LABEL: nearbyint_f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -238,7 +237,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: nearbyint_f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -248,7 +247,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; ; GFX11-LABEL: nearbyint_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] @@ -264,41 +263,41 @@ entry: define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: nearbyint_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 ; SI-NEXT: s_mov_b32 s9, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_mov_b32 s8, 
s2 +; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v7 -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s2 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v10, s0 +; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1] ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1] ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: nearbyint_v2f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -309,8 +308,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: nearbyint_v2f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 
v[2:3], s[6:7] ; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] @@ -322,8 +321,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; GFX11-LABEL: nearbyint_v2f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] @@ -341,8 +340,8 @@ entry: define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: nearbyint_v4f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_brev_b32 s14, -2 @@ -391,8 +390,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; CI-LABEL: nearbyint_v4f64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -406,8 +405,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: nearbyint_v4f64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] ; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9] @@ -426,8 +425,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x 
double> % ; GFX11-LABEL: nearbyint_v4f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 74e2b9ea714258..b5440b9c38c9f2 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s4, 0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_select_infloop_regression_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s4, 0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] -; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %i = select i1 %arg1, double 0.0, double %arg @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; 
SI-LABEL: s_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: s_bitcmp1_b32 s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; ; VI-LABEL: s_fneg_select_infloop_regression_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bitcmp1_b32 s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s1, 1, s1 ; SI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s1, 1, s1 ; 
VI-NEXT: s_cselect_b32 s0, 0, s0 @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_select_infloop_regression_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s6, 0 @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fabs_select_infloop_regression_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { define 
amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) { ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 8267bb9f5450f8..4364b32e62f8c9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -7,12 +7,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -23,8 +23,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -36,8 +36,8 @@ define amdgpu_kernel void 
@fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fadd_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -49,13 +49,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_sub_f16_e64 v1, s2, |s4| +; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -70,13 +70,13 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) { ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -87,8 +87,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; VI-LABEL: fneg_fabs_fmul_f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -100,8 +100,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; ; GFX9-LABEL: fneg_fabs_fmul_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 @@ -113,13 +113,13 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-LABEL: fneg_fabs_fmul_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mul_f16_e64 v1, s2, -|s4| +; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -137,8 +137,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI-LABEL: fneg_fabs_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -149,8 +149,8 @@ define amdgpu_kernel 
void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; VI-LABEL: fneg_fabs_free_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -161,8 +161,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; ; GFX9-LABEL: fneg_fabs_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -173,10 +173,10 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: fneg_fabs_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -193,8 +193,8 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI-LABEL: fneg_fabs_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -205,8 +205,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr 
addrspace(1) %out, half %in) { ; ; VI-LABEL: fneg_fabs_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -217,8 +217,8 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; ; GFX9-LABEL: fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bitset1_b32 s2, 15 @@ -229,10 +229,10 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fneg_fabs_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_bitset1_b32 s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -248,7 +248,7 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -262,7 +262,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fabs_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -273,7 +273,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fabs_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -293,12 +293,12 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -314,8 +314,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -331,8 +331,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -344,11 +344,11 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s4 +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,8 +367,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) { ; CI-LABEL: s_fneg_fabs_v2f16_bc_src: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -379,8 +379,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_fneg_fabs_v2f16_bc_src: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -391,8 +391,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -403,10 +403,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -422,7 +422,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -435,7 +435,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX9-LABEL: fneg_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -447,7 +447,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 
0x80008000 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 @@ -467,12 +467,12 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -487,8 +487,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fold_user_fneg_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 @@ -503,8 +503,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; ; GFX9-LABEL: fold_user_fneg_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -515,11 +515,11 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; GFX11-LABEL: fold_user_fneg_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -536,8 +536,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -553,8 +553,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; VI-LABEL: s_fneg_multi_use_fabs_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -570,11 +570,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -585,8 +585,8 @@ define amdgpu_kernel void 
@s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) @@ -609,8 +609,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) { ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -633,8 +633,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff 
+; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v0, v2, s[0:1] @@ -668,8 +668,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index d0115523b18823..2c9042ec17da88 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -20,8 +20,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -40,7 +40,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr 
addrspace(1) %yptr) { ; SI-LABEL: v_fneg_fabs_fadd_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_fneg_fabs_fadd_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) { ; SI-LABEL: fneg_fabs_fmul_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -88,8 +88,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, ; ; VI-LABEL: fneg_fabs_fmul_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -108,7 +108,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: fneg_fabs_free_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 
31 @@ -122,7 +122,7 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: fneg_fabs_free_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_or_b32 s0, s3, 0x80000000 @@ -174,8 +174,8 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) { ; SI-LABEL: fneg_fabs_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s5, 31 @@ -187,14 +187,14 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl ; ; VI-LABEL: fneg_fabs_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_bitset1_b32 s3, 31 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) @@ -206,8 +206,8 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fneg_fabs_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -222,8 +222,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % ; ; VI-LABEL: fneg_fabs_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s5, 0x80000000 @@ -244,8 +244,8 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fneg_fabs_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -267,8 +267,8 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> % ; ; VI-LABEL: fneg_fabs_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s7, 31 ; VI-NEXT: s_bitset1_b32 s5, 31 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 6446145bbfe2ad..3c000d4fa63a38 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fadd_f32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) { ; SI-LABEL: fneg_fabsf_fmul_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| @@ -67,11 +67,11 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_fabsf_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -79,10 +79,10 @@ define amdgpu_kernel void 
@fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_fabsf_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -129,11 +129,11 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: s_or_b32 s4, s2, 0x80000000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -141,10 +141,10 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b32 s2, s4, 0x80000000 +; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_fneg_fabsf_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_fneg_fabsf_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fneg_fabsf_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s3, 31 @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-LABEL: fneg_fabsf_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitset1_b32 s7, 31 @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; ; VI-LABEL: fneg_fabsf_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s7, 0x80000000 ; VI-NEXT: s_or_b32 s3, s6, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 63ccaafeda88f4..cd1ec85eb8d0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1475,11 +1475,11 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) { ; GFX7-LABEL: multiple_uses_fneg_select_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[6:7], 0x4 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x6 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x4 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bitcmp1_b32 s8, 0 +; GFX7-NEXT: s_bitcmp1_b32 s6, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX7-NEXT: v_mov_b32_e32 v0, s3 @@ -1497,12 +1497,12 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; ; GFX9-LABEL: multiple_uses_fneg_select_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x18 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bitcmp1_b32 s8, 0 +; GFX9-NEXT: s_bitcmp1_b32 s6, 0 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -1519,13 +1519,13 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i 
; GFX11-LABEL: multiple_uses_fneg_select_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x18 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x18 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s5 -; GFX11-NEXT: s_bitcmp1_b32 s8, 0 +; GFX11-NEXT: s_bitcmp1_b32 s2, 0 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; GCN-LABEL: fnge_select_f32_multi_use_regression: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -1562,7 +1562,7 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) { ; ; GFX11-LABEL: fnge_select_f32_multi_use_regression: ; GFX11: ; %bb.0: ; %.entry -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 40982347f3ca00..31c1389c940208 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI-LABEL: s_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, 
s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -20,8 +20,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX8-LABEL: s_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -32,8 +32,8 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; ; GFX9-LABEL: s_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -44,10 +44,10 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX11-LABEL: s_fneg_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX8-LABEL: v_fneg_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -92,7 +92,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_fneg_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] @@ -103,9 +103,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: v_fneg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -127,8 +125,8 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-LABEL: s_fneg_free_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -139,8 +137,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX8-LABEL: s_fneg_free_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -151,8 +149,8 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; ; GFX9-LABEL: s_fneg_free_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x8000 @@ -163,10 +161,10 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX11-LABEL: s_fneg_free_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x8000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] @@ -182,7 +180,7 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -199,7 +197,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -213,7 +211,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: v_fneg_fold_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -224,7 +222,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: v_fneg_fold_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -244,8 +242,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: s_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -256,8 +254,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX8-LABEL: s_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -268,8 +266,8 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; ; GFX9-LABEL: s_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -280,10 +278,10 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX11-LABEL: s_fneg_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -298,7 +296,7 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND @@ -312,7 +310,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX9-LABEL: s_fneg_v2f16_nonload: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s2 ; GFX9-NEXT: ;;#ASMEND @@ -325,7 +323,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; ; GFX11-LABEL: s_fneg_v2f16_nonload: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s2 ; GFX11-NEXT: ;;#ASMEND @@ -347,7 +345,7 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { define amdgpu_kernel void 
@v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -361,7 +359,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: v_fneg_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -375,7 +373,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_fneg_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] @@ -386,9 +384,7 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_fneg_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] @@ -410,8 +406,8 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-LABEL: fneg_free_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -422,8 +418,8 @@ 
define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX8-LABEL: fneg_free_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -434,8 +430,8 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; ; GFX9-LABEL: fneg_free_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000 @@ -446,10 +442,10 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX11-LABEL: fneg_free_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -465,7 +461,7 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -491,7 +487,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr 
addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +503,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -518,7 +514,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; ; GFX11-LABEL: v_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -559,7 +555,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -576,7 +572,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX9-LABEL: v_extract_fneg_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -592,7 +588,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; ; GFX11-LABEL: v_extract_fneg_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] @@ -623,7 +619,7 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 { ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -639,7 +635,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX9-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] @@ -653,7 +649,7 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; ; GFX11-LABEL: v_extract_fneg_no_fold_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index e447429539e6ff..d78bdfe08772a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) 
%out, float %in) { ; SI-LABEL: s_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,10 +19,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: s_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,10 +32,10 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: s_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) { ; SI-LABEL: s_fneg_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: 
s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -78,7 +78,7 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 @@ -97,8 +97,8 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) { ; SI-LABEL: s_fneg_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; ; VI-LABEL: s_fneg_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s7, 0x80000000 ; VI-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -134,8 +134,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; GFX11-LABEL: s_fneg_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s2, s7, 
0x80000000 ; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000 @@ -157,8 +157,8 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fsub0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -168,10 +168,10 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fsub0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -180,11 +180,11 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fsub0_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -198,8 +198,8 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: fneg_free_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb 
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,10 +210,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: fneg_free_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -223,10 +223,10 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: fneg_free_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -242,8 +242,8 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fneg_fold_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fneg_fold_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, -s4, s4 +; VI-NEXT: v_mul_f32_e64 v2, -s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -265,11 +265,11 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { ; GFX11-LABEL: fneg_fold_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4 +; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -284,8 +284,8 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: bitpreserve_fneg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,10 +295,10 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; ; VI-LABEL: bitpreserve_fneg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0 +; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ 
-307,11 +307,11 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in ; GFX11-LABEL: bitpreserve_fneg_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0 +; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -327,8 +327,8 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -339,10 +339,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s4, 0x80000000 +; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -352,10 +352,10 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80000000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -380,8 +380,8 @@ define i32 @v_fneg_i32(i32 %in) { define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_fneg_i32_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -391,10 +391,10 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_fneg_i32_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -403,11 +403,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { ; GFX11-LABEL: s_fneg_i32_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -434,7 +434,7 @@ define float @v_fneg_i32_fp_use(i32 %in) { define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) 
%out, i64 %in) { ; SI-LABEL: s_fneg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +448,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -488,7 +488,7 @@ define i64 @v_fneg_i64(i64 %in) { define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_fneg_i64_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -500,7 +500,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: s_fneg_i64_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) { ; ; GFX11-LABEL: s_fneg_i64_fp_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], 
-s[2:3], 2.0 @@ -550,24 +550,23 @@ define i16 @v_fneg_i16(i16 %in) { define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; SI-LABEL: s_fneg_i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4 +; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -576,11 +575,11 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) { ; GFX11-LABEL: s_fneg_i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4 +; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -620,8 +619,8 @@ define half @v_fneg_i16_fp_use(i16 %in) { define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16: ; SI: ; %bb.0: -; 
SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,15 +631,15 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; ; VI-LABEL: s_fneg_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -650,10 +649,10 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; GFX11-LABEL: s_fneg_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s4, 0x80008000 +; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -696,35 +695,34 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) { define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) { ; SI-LABEL: s_fneg_v2i16_fp_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, 
s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_v2i16_fp_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 +; VI-NEXT: s_xor_b32 s3, s3, 0x8000 ; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s4, 0x8000 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_add_f16_e64 v1, s3, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -735,11 +733,11 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) ; GFX11-LABEL: s_fneg_v2i16_fp_use: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll index 07739e220c074e..b35de032030043 100644 --- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll +++ b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) { ; FORCESC0SC1-LABEL: store_global: ; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 ; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) @@ -15,7 +15,7 @@ define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) { ; ; NOSC0SC1-LABEL: store_global: ; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 ; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 ; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) @@ -29,7 +29,7 @@ entry: define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) { ; FORCESC0SC1-LABEL: store_flat: ; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 ; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) ; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -38,7 +38,7 @@ define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) { ; ; NOSC0SC1-LABEL: store_flat: ; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 ; 
NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) ; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -52,7 +52,7 @@ entry: define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) { ; GCN-LABEL: store_lds: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -66,7 +66,7 @@ entry: define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) { ; FORCESC0SC1-LABEL: store_scratch: ; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dword s0, s[2:3], 0x24 +; FORCESC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 ; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) ; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 @@ -74,7 +74,7 @@ define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) { ; ; NOSC0SC1-LABEL: store_scratch: ; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dword s0, s[2:3], 0x24 +; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 ; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 ; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) ; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0 @@ -102,7 +102,7 @@ main_body: define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) { ; FORCESC0SC1-LABEL: store_global_atomic: ; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 ; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 ; FORCESC0SC1-NEXT: buffer_wbl2 sc1 @@ -112,7 +112,7 @@ define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) { ; ; NOSC0SC1-LABEL: store_global_atomic: ; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 ; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 ; NOSC0SC1-NEXT: buffer_wbl2 sc1 @@ -127,7 +127,7 @@ entry: 
define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) { ; GCN-LABEL: store_global_atomic_system: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 1914b74be1909b..f4a9476dc63cbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -16,7 +16,7 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE @@ -27,7 +27,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE @@ -42,7 +42,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE @@ -53,7 +53,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE @@ -132,7 +132,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -141,7 +141,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -180,7 +180,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -189,7 +189,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -228,7 +228,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -238,7 +238,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 95d8ca391b8438..fd27a5f4a8d6e4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> % define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -77,7 +77,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -89,7 +89,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX12-LABEL: 
flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_mov_b32 s0, 0 @@ -183,17 +183,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -228,17 +228,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 @@ -273,17 +273,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] @@ -319,7 +319,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -329,7 +329,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SE @@ -369,7 +369,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -379,7 +379,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SE @@ -416,4 +416,298 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } +define float @flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %result = call float 
@llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret float %result +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 1023 + %unused = call float 
@llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %ptr, i64 -256 + %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %result = 
call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret <2 x half> %result +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: 
s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 + %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_ret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %result = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret <2 x i16> %result +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__posoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 1023 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + +define void @flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset(ptr %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_intrinsic_noret__negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-1024 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x i16>, ptr %ptr, i64 -256 + %unused = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0.v2bf16(ptr %gep, <2 x i16> %data) + ret void +} + attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index fb731cc00d3f01..18d2e52e8f9002 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,24 +9,24 @@ declare double @llvm.fabs.f64(double) #1 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isinf_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, 
s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x204 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -36,11 +36,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f ; GFX11-LABEL: test_isinf_pattern: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -57,24 +57,24 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 
-; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_nlg_f32_e64 s[4:5], |s4|, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isinf_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s4|, v0 +; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; GFX11-LABEL: test_not_isinf_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -105,7 +105,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) 
nocapture %out, float %x) #0 { ; SI-LABEL: test_not_isinf_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -115,7 +115,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; VI-LABEL: test_not_isinf_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -125,7 +125,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % ; ; GFX11-LABEL: test_not_isinf_pattern_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -142,24 +142,24 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture % define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -169,11 +169,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -192,24 +192,24 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -219,11 +219,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,8 +241,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -253,10 +253,10 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 
s[2:3], s4, s4 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -266,11 +266,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -290,23 +290,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4 @@ -321,14 +321,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -346,7 +346,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocapture %out, float %x, float %y) #0 { ; SI-LABEL: test_isfinite_not_pattern_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_cmp_o_f32_e64 s[4:5], s2, s2 @@ -376,7 +376,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 @@ -401,23 +401,23 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_not_pattern_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4 @@ -432,14 +432,14 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-LABEL: 
test_isfinite_not_pattern_3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s2, s4, s4 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s4| +; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 +; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -457,24 +457,24 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, 
s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -484,11 +484,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; GFX11-LABEL: test_isfinite_pattern_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -507,24 +507,24 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_commute_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_commute_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 
v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -534,11 +534,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; GFX11-LABEL: test_isfinite_pattern_4_commute_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -557,16 +557,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: s_load_dword s1, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_cmp_o_f32_e32 vcc, s0, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s0, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ 
-574,14 +574,14 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x50 -; VI-NEXT: s_load_dword s1, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s5, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -592,15 +592,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x50 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s3, s4, 0x1f8 -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s4, s5 +; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 +; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -618,8 +618,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void 
@test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -632,11 +632,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; ; VI-LABEL: test_isinf_pattern_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -646,11 +646,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; GFX11-LABEL: test_isinf_pattern_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x204 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -667,8 +667,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -684,11 +684,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -698,11 +698,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -721,8 +721,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
@@ -738,11 +738,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_pattern_4_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0 +; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -752,11 +752,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_pattern_4_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s4, 0x1f8 +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 105d9246880a49..587340c7aa342c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -20,8 +20,8 @@ declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30,8 +30,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,19 +41,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -63,8 +63,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -75,8 +75,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen @@ -86,8 +86,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -96,8 +96,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -107,19 +107,19 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; 
%main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -129,8 +129,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -242,15 +242,14 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -258,48 +257,60 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 
offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm @@ -307,8 +318,8 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -320,15 +331,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -336,15 +346,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt 
lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -353,12 +362,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -368,15 +377,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -384,14 +392,13 @@ 
define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -405,8 +412,8 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -415,8 +422,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -426,19 +433,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x 
i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -448,8 +455,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -460,8 +467,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen @@ -471,8 +478,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -481,8 +488,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -492,19 +499,19 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -514,8 +521,8 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -627,7 +634,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -642,7 +649,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -657,7 +664,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -669,7 +676,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -681,7 +688,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -694,7 +701,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s4, 4 @@ -708,7 +715,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -722,7 +729,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -736,7 +743,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -748,7 +755,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -760,7 +767,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index e124aadf4e8c23..e3ed0fa4918845 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -18,8 +18,8 @@ declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) inreg %rsrc, 
float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,8 +28,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -39,19 +39,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -61,8 +61,8 @@ 
define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -72,8 +72,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -82,8 +82,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -93,19 +93,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -115,8 +115,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen @@ -219,15 +219,14 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: 
buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; SI-NEXT: s_load_dword s0, s[2:3], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b32 v1, v0 @@ -235,63 +234,74 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1030: ; 
%bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_clause 0x2 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v1, s6 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 4 offen glc slc -; GFX1100-NEXT: v_mov_b32_e32 v1, s6 +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: 
v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v0, s2 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_SI-NEXT: s_load_dword s0, s[2:3], 0xf -; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b32 v1, v0 @@ -299,15 +309,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: ds_write_b32 v1, v0 @@ -316,12 +325,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX10-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX10-NEXT: 
buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 @@ -331,15 +340,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_clause 0x2 +; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; G_GFX1030-NEXT: s_load_dword s0, s[2:3], 0x3c +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc -; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 @@ -347,14 +355,13 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; G_GFX1100-NEXT: s_load_b32 s0, s[2:3], 0x3c +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc -; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: 
v_mov_b32_e32 v1, s0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: ds_store_b32 v1, v0 @@ -369,8 +376,8 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -379,8 +386,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -390,19 +397,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -412,8 +419,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -423,8 +430,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -433,8 +440,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -444,19 +451,19 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x1 -; 
G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x1 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -466,8 +473,8 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; G_GFX1100: ; %bb.0: ; %main_body ; G_GFX1100-NEXT: s_clause 0x1 -; G_GFX1100-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen @@ -570,7 +577,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrspace(8) inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ 
-585,7 +592,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -600,7 +607,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -612,7 +619,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -624,7 +631,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc @@ -637,7 +644,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; 
G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -651,7 +658,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -665,7 +672,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -677,7 +684,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -689,7 +696,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce04889d..d827ea0503a3b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834aeb7..03b8251ea4640d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp16_to_fp64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) 
noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a01e..8ab82b722445e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; GFX6-LABEL: test_convert_fp32_to_fp16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index ce1fcccf4a17c8..5690b99e43ece4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -26,22 +26,22 @@ declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -73,12 +73,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: 
raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -88,12 +88,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -109,22 +109,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; 
GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -156,12 +156,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 
v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -171,12 +171,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -192,22 +192,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -239,12 +239,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -254,12 +254,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -275,22 +275,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -322,12 +322,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: 
struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -337,12 +337,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -358,22 +358,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, 
s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -405,12 +405,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 
v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -420,12 +420,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -441,22 +441,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -488,12 +488,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -503,12 +503,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -524,22 +524,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -571,12 +571,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: 
struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -586,12 +586,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -607,22 +607,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: 
s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -654,12 +654,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -669,12 +669,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -690,22 +690,22 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -737,12 +737,12 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -752,12 +752,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -773,22 +773,22 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX940-NEXT: s_endpgm @@ -820,12 +820,12 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; 
GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -835,12 +835,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -856,22 +856,22 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 
0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -903,12 +903,12 @@ main_body: define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 
v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -918,12 +918,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -939,22 +939,22 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: 
s_load_dword s8, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX940-NEXT: s_endpgm @@ -986,12 +986,12 @@ main_body: define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v2, s10 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1001,12 +1001,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-NEXT: s_load_dword s10, s[2:3], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x44 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt ; GFX940-NEXT: v_mov_b32_e32 v2, 0 @@ -1022,7 +1022,7 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1047,7 +1047,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, 
s2 @@ -1072,7 +1072,7 @@ main_body: define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s2 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, d ; ; GFX940-LABEL: global_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s2 @@ -1097,23 +1097,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1132,21 +1132,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB39_2: @@ -1159,20 +1159,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 
v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB40_2: @@ -1180,21 +1180,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB40_2: @@ -1207,23 +1207,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1242,21 +1242,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; 
GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: .LBB41_2: @@ -1269,20 +1269,20 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: .LBB42_2: @@ -1290,21 +1290,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB42_2: @@ -1479,23 +1479,23 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; 
%bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -1512,21 +1512,21 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], 
s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: .LBB49_2: @@ -1539,7 +1539,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1564,7 +1564,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1581,7 +1581,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -1593,7 +1593,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1610,7 +1610,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], 
s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1760,7 +1760,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1771,7 +1771,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1806,7 +1806,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX940-LABEL: 
flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -1846,7 +1846,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1857,7 +1857,7 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmin_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1892,7 +1892,7 @@ main_body: define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; GFX90A-LABEL: flat_atomic_fmax_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { ; ; GFX940-LABEL: flat_atomic_fmax_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: v_mov_b32_e32 v1, s1 @@ -1938,16 +1938,16 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, 
double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB63_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] @@ -1959,16 +1959,16 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; ; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB63_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX940-NEXT: s_load_dword s6, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] @@ -2008,21 +2008,21 @@ main_body: define amdgpu_kernel void 
@local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_endpgm main_body: @@ -2056,19 +2056,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: 
s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB67_2: @@ -2076,19 +2076,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB67_2: @@ -2101,19 +2101,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB68_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB68_2: @@ -2121,19 +2121,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB68_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB68_2: @@ -2146,19 +2146,19 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; 
GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB69_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB69_2: @@ -2166,19 +2166,19 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX940-NEXT: s_cbranch_execz .LBB69_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 ; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) 
; GFX940-NEXT: .LBB69_2: @@ -2256,6 +2256,264 @@ main_body: ret double %ret } +define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4088 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4088 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define double @flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void
@flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:4088 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:4088 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: +;
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4088 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4088 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define double @flat_atomic_fmin_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void @flat_atomic_fmin_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
flat_atomic_min_f64 v[0:1], v[2:3] offset:4088 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:4088 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define void @flat_atomic_fmin_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define double @flat_atomic_fmax_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4088 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
flat_atomic_fmax_f64_intrinsic_rtn__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4088 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define double @flat_atomic_fmax_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret double %ret +} + +define void @flat_atomic_fmax_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:4088 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:4088 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 511 + %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret void +} + +define void @flat_atomic_fmax_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { +; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %ptr, i64 -511 + %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) + ret void +} + attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index f18f5752269e00..d610091840b958 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i
define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x 
i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 
0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; 
GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; 
G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; ; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc ; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 
v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 6a2a8c3ce595d7..5f501fec24c2e4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -16,9 +16,9 @@ declare 
double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -41,12 +41,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: 
s_endpgm @@ -54,9 +54,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -91,12 +91,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: 
s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -104,9 +104,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -253,9 +253,9 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { ; SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; 
GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -278,12 +278,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm @@ -291,9 +291,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr 
addrspace(8) ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf -; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 ; G_SI-NEXT: v_mov_b32_e32 v1, s5 @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -328,12 +328,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX10: ; %bb.0: ; %main_body ; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c +; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 ; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 ; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; G_GFX10-NEXT: s_endpgm @@ -341,9 +341,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr 
addrspace(8) ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; G_GFX1030: ; %bb.0: ; %main_body ; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ main_body: define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) { ; SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; 
%main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: v_mov_b32_e32 v0, s4 @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX7-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 3571f3545ad1a1..04ef30bd26aa51 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { ; ; VI-LABEL: fp_to_sint_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,8 +47,8 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i32_fabs: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -58,12 +58,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_i32_fabs: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e64 v0, |s4| +; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2| +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -87,7 +87,7 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in) define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_sint_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: fp_to_sint_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -193,37 +193,37 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_sint_i64: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0x2f800000 -; SI-NEXT: s_mov_b32 s6, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0x2f800000 +; SI-NEXT: s_mov_b32 s2, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 -; SI-NEXT: v_mul_f32_e64 v1, |v0|, s5 +; SI-NEXT: v_trunc_f32_e32 v0, s0 +; SI-NEXT: v_mul_f32_e64 v1, |v0|, s1 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_floor_f32_e32 v1, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v1 -; SI-NEXT: v_fma_f32 v0, v1, s6, |v0| +; SI-NEXT: v_fma_f32 v0, v1, s2, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v2 ; SI-NEXT: v_xor_b32_e32 v0, v0, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0x2f800000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_mov_b32 s5, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s2 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s5, |v0| ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -294,7 +294,7 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 @@ -329,7 +329,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -452,17 +452,17 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_sint_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v1, s4 -; SI-NEXT: v_trunc_f32_e32 v2, s7 -; SI-NEXT: v_trunc_f32_e32 v3, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v1, s0 +; SI-NEXT: v_trunc_f32_e32 v2, s3 +; SI-NEXT: v_trunc_f32_e32 v3, s2 ; SI-NEXT: v_mul_f32_e64 v4, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; SI-NEXT: v_mul_f32_e64 v6, |v1|, s8 @@ -503,14 +503,14 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % ; SI-NEXT: v_subb_u32_e32 v7, vcc, v12, v9, vcc ; SI-NEXT: v_sub_i32_e32 v4, vcc, v13, v11 ; SI-NEXT: v_subb_u32_e32 v5, vcc, v8, v11, vcc -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 ; VI-NEXT: s_mov_b32 s9, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -737,8 +737,8 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> % define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -749,8 +749,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -787,8 +787,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -799,8 +799,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: 
; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -838,8 +838,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_sint_f32_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,12 +849,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) ; ; VI-LABEL: fp_to_sint_f32_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index c6b4e129bacbe2..5abf82aa1aab59 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -8,8 +8,8 @@ declare float @llvm.fabs.f32(float) #1 define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) { ; SI-LABEL: fp_to_uint_f32_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,12 +19,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % ; ; VI-LABEL: fp_to_uint_f32_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -47,7 +47,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float % define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -60,7 +60,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +92,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, 
ptr ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -152,34 +152,34 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) { ; SI-LABEL: fp_to_uint_f32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s5, 0xcf800000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s1, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s4 +; SI-NEXT: v_trunc_f32_e32 v0, s0 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_floor_f32_e32 v2, v1 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v2 -; SI-NEXT: v_fma_f32 v0, v2, s5, v0 +; SI-NEXT: v_fma_f32 v0, v2, s1, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_f32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s2, 0xcf800000 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xcf800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s4 +; VI-NEXT: v_trunc_f32_e32 v0, s2 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_floor_f32_e32 v2, v1 -; VI-NEXT: v_fma_f32 v0, v2, s2, v0 +; VI-NEXT: v_fma_f32 v0, v2, s3, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 
+; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 @@ -264,7 +264,7 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -376,16 +376,16 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) { ; SI-LABEL: fp_to_uint_v4f32_to_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s5 -; SI-NEXT: v_trunc_f32_e32 v2, s4 -; SI-NEXT: v_trunc_f32_e32 v4, s7 -; SI-NEXT: v_trunc_f32_e32 v6, s6 +; SI-NEXT: v_trunc_f32_e32 v0, s1 +; SI-NEXT: v_trunc_f32_e32 v2, s0 +; SI-NEXT: v_trunc_f32_e32 v4, s3 +; SI-NEXT: v_trunc_f32_e32 v6, s2 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -406,14 +406,14 @@ define 
amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v9 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v4f32_to_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -631,8 +631,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in ; ; VI-LABEL: fp_to_uint_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_fabs_f32_to_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, 
s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -681,8 +681,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa ; ; VI-LABEL: fp_to_uint_fabs_f32_to_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 { ; SI-LABEL: fp_to_uint_f32_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -731,12 +731,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; ; VI-LABEL: fp_to_uint_f32_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v0, s4 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 8c6dc4395839c0..82c25c01b17792 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll 
@@ -7,7 +7,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX89-LABEL: fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; ; GFX11-LABEL: fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fpext_f16_to_f64( ; SI-LABEL: fpext_f16_to_f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX89-LABEL: fpext_f16_to_f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; ; GFX11-LABEL: fpext_f16_to_f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -141,7 +141,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; SI-LABEL: 
fpext_v2f16_to_v2f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -161,7 +161,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX89-LABEL: fpext_v2f16_to_v2f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; ; GFX11-LABEL: fpext_v2f16_to_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ entry: define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; SI-LABEL: fpext_v2f16_to_v2f64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -234,7 +234,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX89-LABEL: fpext_v2f16_to_v2f64: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -255,7 +255,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; ; GFX11-LABEL: fpext_v2f16_to_v2f64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -290,35 +290,46 @@ entry: define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) { ; 
SI-LABEL: s_fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; GFX89-LABEL: s_fneg_fpext_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; VI-LABEL: s_fneg_fpext_f16_to_f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_fneg_fpext_f16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -334,7 +345,7 @@ entry: define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; SI-LABEL: fneg_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -352,7 +363,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -370,7 +381,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -400,7 +411,7 @@ entry: define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; SI-LABEL: fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -418,7 +429,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -436,7 +447,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: 
fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -466,7 +477,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; SI-LABEL: fneg_fabs_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +495,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -502,7 +513,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -535,7 +546,7 @@ entry: define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -557,7 +568,7 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -579,7 +590,7 @@ define 
amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -614,7 +625,7 @@ entry: define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -638,7 +649,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -660,7 +671,7 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -696,7 +707,7 @@ entry: define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -718,7 +729,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -740,7 +751,7 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -775,7 +786,7 @@ entry: define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -799,7 +810,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -821,7 +832,7 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -857,7 +868,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -879,7 +890,7 @@ define amdgpu_kernel void 
@fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -901,7 +912,7 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -937,7 +948,7 @@ entry: define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -961,7 +972,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -983,7 +994,7 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1020,6 +1031,3 @@ entry: declare half @llvm.fabs.f16(half) #1 attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX9: {{.*}} -; VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 0e12cca1900ce6..238010ec05e4db 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; VI-LABEL: fptosi_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; ; GFX11-LABEL: fptosi_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i32( ; SI-LABEL: fptosi_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; VI-LABEL: fptosi_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; ; GFX11-LABEL: fptosi_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 
s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i64( ; SI-LABEL: fptosi_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; VI-LABEL: fptosi_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; ; GFX11-LABEL: fptosi_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; SI-LABEL: fptosi_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; VI-LABEL: fptosi_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -261,7 +261,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: 
s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -296,7 +296,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; SI-LABEL: fptosi_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -318,7 +318,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; VI-LABEL: fptosi_v2f16_to_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -377,7 +377,7 @@ entry: define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; SI-LABEL: fptosi_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; VI-LABEL: fptosi_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -424,7 +424,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; ; GFX11-LABEL: fptosi_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s10, s6 @@ -462,8 +462,8 @@ entry: define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptosi_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -475,8 +475,8 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; ; VI-LABEL: fptosi_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,11 +488,11 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptosi_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index abc5c7af13b0ce..1116dc9ae2e5b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 
s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; VI-LABEL: fptoui_f16_to_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; ; GFX11-LABEL: fptoui_f16_to_i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i32( ; SI-LABEL: fptoui_f16_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; VI-LABEL: fptoui_f16_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; ; GFX11-LABEL: fptoui_f16_to_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i64( ; SI-LABEL: fptoui_f16_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -164,7 +164,7 @@ define amdgpu_kernel void 
@fptoui_f16_to_i64( ; ; VI-LABEL: fptoui_f16_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -184,7 +184,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; ; GFX11-LABEL: fptoui_f16_to_i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -216,7 +216,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; SI-LABEL: fptoui_v2f16_to_v2i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; VI-LABEL: fptoui_v2f16_to_v2i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -260,7 +260,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -295,7 +295,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; SI-LABEL: fptoui_v2f16_to_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -317,7 +317,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; VI-LABEL: fptoui_v2f16_to_v2i32: ; VI: ; %bb.0: 
; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -338,7 +338,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -376,7 +376,7 @@ entry: define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; SI-LABEL: fptoui_v2f16_to_v2i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; VI-LABEL: fptoui_v2f16_to_v2i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -423,7 +423,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; ; GFX11-LABEL: fptoui_v2f16_to_v2i64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -460,22 +460,21 @@ entry: define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; SI-LABEL: fptoui_f16_to_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 1.0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fptoui_f16_to_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -487,11 +486,11 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-LABEL: fptoui_f16_to_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 65ac2e240469de..6cc7368eeae616 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -41,7 +41,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -59,7 +59,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -89,7 +89,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -101,7 +101,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define 
amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -144,7 +144,7 @@ entry: define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-LABEL: fptrunc_f64_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -163,7 +163,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; SI-GISEL-LABEL: fptrunc_f64_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -176,7 +176,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -208,7 +208,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -227,7 +227,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -240,7 +240,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -262,7 +262,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -287,7 +287,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -308,7 +308,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; 
SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -323,7 +323,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -343,7 +343,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -358,7 +358,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -378,7 +378,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX11-GISEL-LABEL: 
fptrunc_v2f32_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -441,7 +441,7 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -481,7 +481,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -503,7 +503,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -519,7 +519,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -541,7 +541,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 @@ -557,7 +557,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -584,7 +584,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -613,7 +613,7 @@ entry: define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -631,7 +631,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: 
s_mov_b32 s2, -1 @@ -643,7 +643,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -661,7 +661,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -673,7 +673,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -691,7 +691,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -703,7 +703,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -723,7 +723,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: 
fneg_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -747,7 +747,7 @@ entry: define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -765,7 +765,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -777,7 +777,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -807,7 +807,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -825,7 +825,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -837,7 +837,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -881,7 +881,7 @@ entry: define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -899,7 +899,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, 
s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -911,7 +911,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -929,7 +929,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -941,7 +941,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -959,7 +959,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -971,7 +971,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -991,7 +991,7 @@ define 
amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1016,7 +1016,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: 
fptrunc_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1106,7 +1106,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1155,7 +1155,7 @@ entry: define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; SI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; SI-GISEL: ; 
%bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1185,7 +1185,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1245,7 +1245,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1267,7 +1267,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 @@ -1295,7 +1295,7 @@ entry: define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; SI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; SI-GISEL: ; %bb.0: ; %entry -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-GISEL: ; %bb.0: ; %entry -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1378,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 1ba5e8f916cbaa..e4aa4d1d3ddb55 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) 
%out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-SDAG-LABEL: fptrunc_f64_to_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s6, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; VI-GISEL-LABEL: fptrunc_f64_to_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-GISEL-NEXT: s_mov_b32 s2, -1 @@ -50,7 +50,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -60,7 +60,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 @@ -70,7 +70,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f32: ; GFX11-SDAG: ; %bb.0: -; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -82,7 +82,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 @@ -99,7 +99,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -159,7 +159,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-SAFE-GISEL: ; %bb.0: -; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 ; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 @@ -270,7 +270,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; 
VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 @@ -281,7 +281,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-GISEL: ; %bb.0: -; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -292,7 +292,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -348,7 +348,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-GISEL: ; %bb.0: -; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -400,7 +400,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -411,7 +411,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-GISEL: ; %bb.0: -; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -422,7 +422,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-SDAG: ; %bb.0: -; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff ; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 @@ -489,7 +489,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-SAFE-GISEL: ; %bb.0: -; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff ; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 @@ -548,7 +548,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; ; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-SDAG: ; %bb.0: -; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 @@ -562,7 +562,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double 
%in) ; ; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-UNSAFE-GISEL: ; %bb.0: -; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 @@ -582,8 +582,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) { ; SI-LABEL: fptrunc_v2f64_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +594,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -606,8 +606,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -619,8 +619,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-SDAG-LABEL: 
fptrunc_v2f64_to_v2f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -632,8 +632,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -645,8 +645,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -660,8 +660,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 
0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -679,37 +679,37 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { ; SI-LABEL: fptrunc_v3f64_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x15 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x54 -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: 
buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -721,25 +721,24 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x54 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GFX10-SDAG-NEXT: s_clause 0x2 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -751,17 +750,16 @@ define amdgpu_kernel void 
@fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; ; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 +; GFX11-SDAG-NEXT: s_clause 0x2 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -770,8 +768,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -790,8 +788,8 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fptrunc_v4f64_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,8 +802,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -818,8 +816,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; ; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -833,8 +831,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -848,8 +846,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -863,8 +861,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -880,8 +878,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -901,8 +899,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { ; SI-LABEL: fptrunc_v8f64_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -920,8 +918,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 
0x64 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; VI-SDAG-NEXT: s_mov_b32 s2, -1 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -939,8 +937,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; ; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 ; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -959,8 +957,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -979,8 +977,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -999,8 +997,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-SDAG-LABEL: 
fptrunc_v8f64_to_v8f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1022,8 +1020,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do ; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 7c5d73ab66b47a..0d59021b69019f 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -51,8 +51,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ 
-92,8 +92,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -120,12 +120,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -139,13 +139,13 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -159,8 +159,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f16: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -184,8 +184,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -218,8 +218,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -247,8 +247,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -276,8 +276,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -299,12 +299,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -316,13 +316,13 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -334,8 +334,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -356,8 +356,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -387,8 +387,8 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -416,8 +416,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -445,8 +445,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 8 @@ -468,12 +468,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) 
%out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v3, v2 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -485,13 +485,13 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f16_e32 v3, v2 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 @@ -503,8 +503,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -525,8 +525,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 
0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -556,8 +556,8 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -592,8 +592,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -628,8 +628,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -662,12 +662,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 @@ -690,13 +690,13 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 @@ -719,8 +719,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -756,8 +756,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 
; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -802,8 +802,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -827,8 +827,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: fast_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -852,8 +852,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -875,12 +875,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, 
s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -892,13 +892,13 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -910,8 +910,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -932,8 +932,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -963,8 +963,8 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) 
%in1, ; SI-LABEL: unsafe_frem_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,8 +988,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1013,8 +1013,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1036,12 +1036,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f32_e32 v3, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1053,13 +1053,13 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; 
GFX10-LABEL: unsafe_frem_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -1071,8 +1071,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1093,8 +1093,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1124,8 +1124,8 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1182,8 +1182,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; CI-LABEL: frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1248,12 +1248,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; ; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1275,13 +1275,13 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-LABEL: frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -1302,8 +1302,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-LABEL: frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1337,8 +1337,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-LABEL: frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v12, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1379,8 +1379,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1430,8 +1430,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr 
addrspace(1) ; ; CI-LABEL: fast_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1461,8 +1461,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: fast_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1488,12 +1488,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: fast_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1511,13 +1511,13 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: fast_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 
v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1535,8 +1535,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-LABEL: fast_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1566,8 +1566,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-LABEL: fast_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1604,8 +1604,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1655,8 +1655,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: unsafe_frem_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_mov_b32 s2, s10 @@ -1686,8 +1686,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: unsafe_frem_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1713,12 +1713,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: unsafe_frem_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1736,13 +1736,13 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX10-LABEL: unsafe_frem_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1760,8 +1760,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-LABEL: unsafe_frem_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1791,8 +1791,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-LABEL: unsafe_frem_f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -1829,8 +1829,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1892,8 +1892,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -1955,8 +1955,8 @@ define 
amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1995,12 +1995,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 @@ -2023,13 +2023,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 @@ -2052,8 +2052,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2090,8 +2090,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2139,8 +2139,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2242,8 +2242,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2345,8 +2345,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2405,12 +2405,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 @@ -2448,13 +2448,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5 @@ -2492,8 +2492,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 
v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2552,8 +2552,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f16: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2625,8 +2625,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2676,8 +2676,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -2727,8 +2727,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2776,12 +2776,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; ; GFX9-LABEL: frem_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 @@ -2819,13 +2819,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 @@ -2863,8 +2863,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2921,8 +2921,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX1150-LABEL: frem_v2f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -2989,8 +2989,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3070,8 +3070,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3151,8 +3151,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3230,12 +3230,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 @@ -3303,13 +3303,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 @@ -3377,8 +3377,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3477,8 +3477,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v4f32: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 
+; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 @@ -3589,8 +3589,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3681,8 +3681,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s10, s2 @@ -3730,8 +3730,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3777,12 +3777,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3818,13 +3818,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: frem_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] @@ -3858,8 +3858,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-LABEL: frem_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3912,8 +3912,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-LABEL: frem_v2f64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX1150-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX1150-NEXT: v_mov_b32_e32 v16, 0 ; 
GFX1150-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index ea588df86b8467..4ea3323a9dbfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,49 +13,51 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind rea define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_lshr_b32 s5, s4, 1 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_not_b32 s4, s6 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v0, s5, v0, v1 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_not_b32 s5, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_not_b32 s3, s6 -; VI-NEXT: s_lshr_b32 s2, s4, 1 -; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshr_b32 s1, s6, 1 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 
+; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_not_b32 s3, s6 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -75,30 +77,30 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX10-NEXT: s_lshr_b32 s0, s6, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, 1 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_not_b32 s3, s6 +; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX11-NEXT: s_lshr_b32 s1, s6, 1 +; GFX11-NEXT: s_not_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -111,7 +113,7 @@ entry: define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +126,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -135,7 +137,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -157,7 +159,7 @@ define amdgpu_kernel 
void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -166,7 +168,7 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 @@ -183,15 +185,15 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: s_lshr_b32 s2, s5, 1 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 @@ -206,47 +208,47 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; 
VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: s_not_b32 s3, s3 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s4, 1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 +; VI-NEXT: s_lshr_b32 s3, s4, 1 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: s_not_b32 s3, s9 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: s_not_b32 s1, s9 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_not_b32 s3, s8 +; GFX9-NEXT: s_not_b32 s1, s8 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], 
s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -270,39 +272,39 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX10-NEXT: v_alignbit_b32 v3, s4, s6, 1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshr_b32 s0, s5, 1 +; GFX10-NEXT: s_not_b32 s1, s3 ; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v1, s2, v0, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s0 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 ; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 ; GFX11-NEXT: s_lshr_b32 s5, s5, 1 -; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_not_b32 s3, s3 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s0 -; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -315,8 +317,8 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -329,8 +331,8 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -343,15 +345,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -371,20 +373,20 @@ define amdgpu_kernel void 
@fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 @@ -402,44 +404,44 @@ entry: define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s1, s19 ; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_not_b32 s11, s15 ; SI-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v3, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s7, s7, 1 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_alignbit_b32 v3, s7, 
v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_not_b32 s1, s18 +; SI-NEXT: s_not_b32 s7, s14 ; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s6, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s6, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_not_b32 s1, s17 +; SI-NEXT: s_not_b32 s6, s13 ; SI-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s5, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s5, s5, 1 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_not_b32 s1, s16 +; SI-NEXT: s_not_b32 s5, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s4, 1 -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: s_lshr_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s15 @@ -472,36 +474,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshl_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_not_b32 s3, s15 +; GFX9-NEXT: s_not_b32 s1, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 -; GFX9-NEXT: s_lshr_b32 s2, s7, 1 +; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: s_not_b32 s3, s14 +; GFX9-NEXT: s_not_b32 s1, s14 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v2, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: s_not_b32 s3, s13 +; GFX9-NEXT: s_not_b32 s1, s13 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 +; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_not_b32 s3, s12 +; GFX9-NEXT: s_not_b32 s1, s12 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_alignbit_b32 v0, s2, v0, v5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -532,11 +534,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX10-LABEL: fshl_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: 
s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 @@ -560,9 +562,9 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-LABEL: fshl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 @@ -594,10 +596,10 @@ entry: define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: fshl_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -607,13 +609,13 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; 
SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -630,9 +632,9 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshl_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -666,22 +668,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshl_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_alignbit_b32 v3, s7, s11, 31 @@ -702,7 +704,7 @@ entry: define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: orxor2or1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -718,7 +720,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 @@ -732,7 +734,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 @@ -759,7 +761,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 @@ -772,7 +774,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index dbcebe6e07e3fe..e8310e73f9a475 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,40 +22,42 @@ 
declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 
+; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -72,24 +74,24 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s5, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s4, s5, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -103,7 +105,7 @@ entry: define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -116,7 +118,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr 
addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -127,7 +129,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -149,7 +151,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -158,7 +160,7 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 @@ -175,9 +177,9 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -192,33 +194,33 @@ define amdgpu_kernel void @fshr_v2i32(ptr 
addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -240,13 +242,13 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] @@ -255,16 +257,16 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-LABEL: fshr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -277,8 +279,8 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -291,8 +293,8 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -305,15 +307,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -333,20 +335,20 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 @@ -364,11 +366,11 @@ entry: define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s19, 0xf000 -; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s15 @@ -382,14 +384,14 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v4, s12 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s15 @@ -410,10 +412,10 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX9-LABEL: fshr_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; 
GFX9-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 @@ -451,9 +453,9 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x54 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s15 @@ -464,15 +466,15 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[12:15], s[2:3], 0x54 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 @@ -496,10 +498,10 @@ entry: define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { ; SI-LABEL: 
fshr_v4i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v1, s10 @@ -509,13 +511,13 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v4i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -532,9 +534,9 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; ; GFX9-LABEL: fshr_v4i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 @@ -566,22 +568,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-LABEL: fshr_v4i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v4i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 8fd201038ad160..8779bb0df0f711 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -3975,15 +3975,15 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) { define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_neg0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: 
v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4005,17 +4005,18 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_neg0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4031,25 +4032,24 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_neg0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; 
SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4063,24 +4063,22 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_neg0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: 
v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4095,8 +4093,9 @@ define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4110,15 +4109,15 @@ entry: define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_pos0: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4140,17 +4139,18 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; ; GISEL-IEEE-LABEL: elim_redun_check_pos0: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; 
GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4165,25 +4165,24 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s6, 0 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_pos0: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4197,24 +4196,22 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_pos0: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4228,8 +4225,9 @@ define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %i ; GISEL-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s4, 0 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e64 vcc, s2, 0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GISEL-DAZ-NEXT: s_mov_b32 
s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4243,15 +4241,15 @@ entry: define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) { ; SDAG-IEEE-LABEL: elim_redun_check_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dword s0, s[2:3], 0xb -; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; SDAG-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 ; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 @@ -4273,17 +4271,18 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; ; GISEL-IEEE-LABEL: elim_redun_check_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dword s6, s[2:3], 0xb -; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dword s2, s[0:1], 0xb +; GISEL-IEEE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-IEEE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; 
GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v0 +; GISEL-IEEE-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-IEEE-NEXT: v_add_i32_e64 v2, s[0:1], -1, v1 ; GISEL-IEEE-NEXT: v_fma_f32 v3, -v2, v1, v0 ; GISEL-IEEE-NEXT: v_add_i32_e64 v4, s[0:1], 1, v1 @@ -4299,25 +4298,24 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-IEEE-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s6, v1 +; GISEL-IEEE-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-IEEE-NEXT: s_mov_b32 s6, -1 ; GISEL-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GISEL-IEEE-NEXT: s_endpgm ; ; SDAG-DAZ-LABEL: elim_redun_check_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dword s0, s[2:3], 0xb +; SDAG-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s0, v1 -; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; SDAG-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SDAG-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; SDAG-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SDAG-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; SDAG-DAZ-NEXT: s_mov_b32 s2, -1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -4331,24 +4329,22 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x260 ; SDAG-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SDAG-DAZ-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-DAZ-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; SDAG-DAZ-NEXT: s_endpgm ; ; GISEL-DAZ-LABEL: elim_redun_check_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dword s4, s[2:3], 0xb -; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dword s2, s[0:1], 0xb ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 +; GISEL-DAZ-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s4 -; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s4, v1 -; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, s2, v1 +; GISEL-DAZ-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v0 -; GISEL-DAZ-NEXT: s_mov_b32 s3, 0xf000 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v2, v0, v1 ; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GISEL-DAZ-NEXT: v_fma_f32 v3, -v1, v2, 0.5 @@ -4363,8 +4359,9 @@ define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-DAZ-NEXT: v_bfrev_b32_e32 v1, 1 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s4, v1 +; GISEL-DAZ-NEXT: v_cmp_nge_f32_e32 vcc, s2, v1 ; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-DAZ-NEXT: s_mov_b32 s2, -1 ; GISEL-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-DAZ-NEXT: s_endpgm entry: @@ -4378,7 +4375,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; 
SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4426,7 +4423,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4478,7 +4475,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; SDAG-DAZ-LABEL: elim_redun_check_v2: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4524,7 +4521,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; ; GISEL-DAZ-LABEL: elim_redun_check_v2: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) @@ -4582,7 +4579,7 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 @@ -4630,7 +4627,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: ; GISEL-IEEE: ; %bb.0: ; %entry -; GISEL-IEEE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-IEEE-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x9 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-IEEE-NEXT: s_waitcnt lgkmcnt(0) @@ -4682,7 +4679,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; SDAG-DAZ-LABEL: elim_redun_check_v2_ult: ; SDAG-DAZ: ; %bb.0: ; %entry -; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SDAG-DAZ-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SDAG-DAZ-NEXT: s_mov_b32 s7, 0xf000 @@ -4728,7 +4725,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; ; GISEL-DAZ-LABEL: elim_redun_check_v2_ult: ; GISEL-DAZ: ; %bb.0: ; %entry -; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GISEL-DAZ-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GISEL-DAZ-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GISEL-DAZ-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index f6df1cbbdd06b8..f72d4e0e03633c 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -7,58 +7,58 @@ define amdgpu_kernel void @fsub_f16( ; SI-LABEL: fsub_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, 
s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX89-LABEL: fsub_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX89-NEXT: s_mov_b32 s11, 0xf000 -; GFX89-NEXT: s_mov_b32 s10, -1 -; GFX89-NEXT: s_mov_b32 s14, s10 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s12, s6 ; GFX89-NEXT: s_mov_b32 s13, s7 -; GFX89-NEXT: s_mov_b32 s15, s11 -; GFX89-NEXT: s_mov_b32 s2, s10 -; GFX89-NEXT: s_mov_b32 s3, s11 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s4 -; GFX89-NEXT: s_mov_b32 s9, s5 +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 ; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -93,7 +93,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_a( ; SI-LABEL: fsub_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX89-LABEL: fsub_f16_imm_a: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @fsub_f16_imm_a( ; ; GFX11-LABEL: fsub_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -160,7 +160,7 @@ entry: define amdgpu_kernel void @fsub_f16_imm_b( ; SI-LABEL: fsub_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX89-LABEL: fsub_f16_imm_b: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -198,7 +198,7 @@ define amdgpu_kernel void @fsub_f16_imm_b( ; ; GFX11-LABEL: fsub_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -227,21 +227,21 @@ entry: define amdgpu_kernel void @fsub_v2f16( ; SI-LABEL: fsub_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -256,60 +256,60 @@ define amdgpu_kernel void @fsub_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fsub_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; 
VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fsub_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: fsub_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -343,7 +343,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_a( ; SI-LABEL: fsub_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; VI-LABEL: fsub_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -390,7 +390,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX9-LABEL: fsub_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -409,7 +409,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_a( ; ; GFX11-LABEL: fsub_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @fsub_v2f16_imm_b( ; SI-LABEL: fsub_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -464,7 +464,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; VI-LABEL: fsub_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -485,7 +485,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX9-LABEL: fsub_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -504,7 +504,7 @@ define amdgpu_kernel void @fsub_v2f16_imm_b( ; ; GFX11-LABEL: fsub_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 0a7070cf54542e..bdaee6eb1ea64c 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -6,7 +6,7 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i1_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s6, 1 +; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -28,7 +28,7 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i8_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -47,7 
+47,7 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -66,7 +66,7 @@ define void @void_func_i32_inreg(i32 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -85,8 +85,8 @@ define void @void_func_i64_inreg(i64 inreg %arg0) #0 { ; GFX9-LABEL: void_func_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -105,7 +105,7 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 { ; GFX9-LABEL: void_func_f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -124,7 +124,7 @@ define void @void_func_f32_inreg(float inreg %arg0) #0 { ; GFX9-LABEL: void_func_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -143,8 +143,8 @@ define void @void_func_f64_inreg(double inreg %arg0) #0 { ; GFX9-LABEL: void_func_f64_inreg: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -163,7 +163,7 @@ define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -182,9 +182,9 @@ define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -214,8 +214,8 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -234,10 +234,10 @@ define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; 
GFX9-NEXT: global_store_short v[0:1], v0, off +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -259,10 +259,10 @@ define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -282,8 +282,8 @@ define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -302,9 +302,9 @@ define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -324,10 +324,10 @@ define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -347,12 +347,12 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -360,7 +360,7 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_clause 0x1 @@ -375,16 +375,16 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -392,8 +392,8 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -437,28 +437,28 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], 
off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -466,12 +466,12 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -488,33 +488,47 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; 
GFX9-LABEL: void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -522,18 +536,24 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v32i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: 
v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -551,10 +571,10 @@ define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -574,13 +594,13 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -588,7 +608,7 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 { ; GFX11-LABEL: 
void_func_v3i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -603,16 +623,16 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -620,8 +640,8 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -636,20 +656,20 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { 
; GFX9-LABEL: void_func_v5i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -657,11 +677,11 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v5i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off @@ -675,28 +695,28 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; 
GFX9-LABEL: void_func_v8i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -704,12 +724,12 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: 
v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -726,33 +746,47 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -760,18 +794,24 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16i64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; 
GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -789,7 +829,7 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -808,9 +848,9 @@ define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 
v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -831,8 +871,8 @@ define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -851,10 +891,10 @@ define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -874,16 +914,16 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 
v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -891,8 +931,8 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -907,8 +947,8 @@ define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -927,9 +967,9 @@ define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dwordx3 v[0:1], v[0:2], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -949,10 +989,10 @@ define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; 
GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -972,16 +1012,16 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -989,8 +1029,8 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1005,28 +1045,28 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX9-LABEL: 
void_func_v16f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1034,12 +1074,12 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: 
v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1056,10 +1096,10 @@ define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1079,13 +1119,13 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: 
global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1093,7 +1133,7 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v3f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1108,16 +1148,16 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1125,8 +1165,8 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v4f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: 
v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1141,28 +1181,28 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1170,12 +1210,12 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v8f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX11-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 ; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 ; GFX11-NEXT: s_clause 0x3 @@ -1192,33 +1232,47 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16f64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-NEXT: v_mov_b32_e32 v5, s29 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1226,18 +1280,24 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16f64_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29 +; GFX11-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v7, s27 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: 
v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 -; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 -; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17 +; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off +; GFX11-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; GFX11-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 ; GFX11-NEXT: s_clause 0x4 @@ -1255,86 +1315,104 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], 
v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_byte v[0:1], v17, off +; GFX9-NEXT: global_store_byte v[0:1], v7, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v18, off +; GFX9-NEXT: global_store_short v[0:1], v8, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v19, off +; GFX9-NEXT: global_store_short v[0:1], v9, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 -; GFX11-NEXT: v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 +; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 +; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 +; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s23 -; GFX11-NEXT: v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25 -; GFX11-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s19 -; GFX11-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v7, s21 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 -; GFX11-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v11, s17 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 -; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, s15 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX11-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v15, s5 +; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 +; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b8 
v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b8 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b8 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v14, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b16 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1349,76 +1427,94 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GFX9-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s29 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[10:13], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s24 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s20 +; GFX9-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s26 -; GFX9-NEXT: v_mov_b32_e32 v1, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s29 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[16:17], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[6:7], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[18:19], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29 +; GFX11-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s26 :: 
v_dual_mov_b32 v13, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 +; GFX11-NEXT: v_dual_mov_b32 v16, s22 :: v_dual_mov_b32 v17, s23 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX11-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v3, s29 -; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 -; GFX11-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25 +; GFX11-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s17 ; GFX11-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v9, s19 -; GFX11-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21 -; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 -; GFX11-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 -; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 -; GFX11-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 -; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off dlc +; GFX11-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15 +; GFX11-NEXT: v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v15, s9 +; GFX11-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s11 +; GFX11-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[4:7], off 
dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[8:11], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[16:19], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[18:21], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b128 v[0:1], v[20:23], off dlc +; GFX11-NEXT: global_store_b128 v[0:1], v[22:25], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[12:13], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v[0:1], v[14:15], off dlc +; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef @@ -1431,147 +1527,156 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX9-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s6 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s7 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s16 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s17 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v16, s18 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s21 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s22 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s23 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s24 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s25 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s26 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s27 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s28 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v16, s29 -; GFX9-NEXT: global_store_dword v[0:1], v16, off +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: v_mov_b32_e32 v6, s23 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: v_mov_b32_e32 v6, s24 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v5, off +; GFX9-NEXT: v_mov_b32_e32 v6, s25 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s26 ; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v7, off +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v8, off +; GFX9-NEXT: v_mov_b32_e32 v6, s28 +; GFX9-NEXT: global_store_dword v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v9, off +; GFX9-NEXT: v_mov_b32_e32 v6, s29 +; GFX9-NEXT: global_store_dword 
v[0:1], v6, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v10, off +; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v11, off +; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v12, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v13, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v14, off +; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v15, off +; GFX9-NEXT: global_store_dword v[0:1], v5, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: too_many_args_use_workitem_id_x_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v13, s1 -; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v15, s3 -; GFX11-NEXT: v_mov_b32_e32 v16, s6 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v13, s16 :: 
v_dual_mov_b32 v12, s7 -; GFX11-NEXT: v_dual_mov_b32 v15, s18 :: v_dual_mov_b32 v14, s17 -; GFX11-NEXT: v_mov_b32_e32 v16, s19 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v2, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v4, s7 +; GFX11-NEXT: v_mov_b32_e32 v6, s9 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v12, s20 -; GFX11-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v14, s22 -; GFX11-NEXT: v_mov_b32_e32 v16, s24 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: 
global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v12, s25 -; GFX11-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 -; GFX11-NEXT: v_mov_b32_e32 v16, s29 -; GFX11-NEXT: global_store_b32 v[0:1], v12, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v13, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off dlc +; GFX11-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, s16 +; GFX11-NEXT: v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v5, s18 +; GFX11-NEXT: v_mov_b32_e32 v6, s19 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v15, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v16, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21 +; GFX11-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23 +; GFX11-NEXT: v_mov_b32_e32 v6, s24 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -1582,15 +1687,22 @@ define void @too_many_args_use_workitem_id_x_inreg( ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v7, off dlc +; 
GFX11-NEXT: v_dual_mov_b32 v5, s28 :: v_dual_mov_b32 v2, s25 +; GFX11-NEXT: v_dual_mov_b32 v3, s26 :: v_dual_mov_b32 v4, s27 +; GFX11-NEXT: v_mov_b32_e32 v6, s29 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v8, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v9, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v6, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v10, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v11, off dlc +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, @@ -1643,10 +1755,10 @@ define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg ; GFX9-LABEL: void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1669,24 +1781,24 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-LABEL: caller_void_func_i32_v2float_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s17, s33 +; GFX9-NEXT: s_mov_b32 s7, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[18:19] -; GFX9-NEXT: v_writelane_b32 v40, s17, 2 +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: v_writelane_b32 v40, s7, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 -; GFX9-NEXT: s_mov_b32 s2, s16 -; GFX9-NEXT: s_mov_b32 s1, s7 -; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b32 s2, s6 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 @@ -1703,19 +1815,19 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s16, -1 +; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s16 +; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v40, s3, 2 ; GFX11-NEXT: 
v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_getpc_b64 s[16:17] -; GFX11-NEXT: s_add_u32 s16, s16, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s17, s17, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[16:17], s[16:17], 0x0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, caller_void_func_i32_v2float_inreg@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, caller_void_func_i32_v2float_inreg@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 @@ -1761,7 +1873,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 { ; GFX9-LABEL: void_func_bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1780,7 +1892,7 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v2bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1799,9 +1911,9 @@ define void @void_func_v3bf16_inreg(<3 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v3bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: global_store_short v[0:1], v0, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s4 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1822,8 +1934,8 @@ define void @void_func_v4bf16_inreg(<4 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v4bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1842,10 +1954,10 @@ define void @void_func_v8bf16_inreg(<8 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v8bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1865,16 +1977,16 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX9-LABEL: void_func_v16bf16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s21 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1882,8 +1994,8 @@ define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 { ; GFX11-LABEL: void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 ; GFX11-NEXT: s_clause 0x1 @@ -1898,10 +2010,10 @@ define void @void_func_2_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1924,10 +2036,10 @@ define void @void_func_2_i64_inreg(i64 inreg %arg0, i64 inreg %arg1, ptr addrspa ; GFX9-LABEL: void_func_2_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1953,13 +2065,13 @@ define void 
@void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX9-LABEL: void_func_i64_inreg_i32_inreg_i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s18 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -1970,7 +2082,7 @@ define void @void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s1 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1989,19 +2101,19 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX9-LABEL: void_func_5_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; 
GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2011,7 +2123,7 @@ define void @void_func_5_i32_inreg(i32 inreg %arg0, i32 inreg %arg1, i32 inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc @@ -2035,12 +2147,12 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX9-LABEL: void_func_a5i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:16 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2048,7 +2160,7 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) ; GFX11-LABEL: void_func_a5i32_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_clause 0x1 @@ -2181,28 +2293,28 @@ define void 
@void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addr ; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s29 -; GFX9-NEXT: v_mov_b32_e32 v4, s28 -; GFX9-NEXT: v_mov_b32_e32 v3, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2210,12 +2322,12 @@ define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addr ; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v4, s24 
-; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22 -; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v8, s20 -; GFX11-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18 -; GFX11-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v12, s16 -; GFX11-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14 +; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 +; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10 +; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8 +; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 ; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 ; GFX11-NEXT: s_clause 0x3 diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll index 2491cc0d19d5a1..1853aa9303095e 100644 --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -31,7 +31,7 @@ bb: define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 @@ -61,7 +61,7 @@ bb: define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b32: ; GCN: ; %bb.0: ; %bb -; 
GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -89,7 +89,7 @@ bb: define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -122,7 +122,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -149,7 +149,7 @@ bb: define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: divergent_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] @@ -180,7 +180,7 @@ bb: define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -205,7 +205,7 @@ bb: define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_or3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -232,7 +232,7 @@ bb: define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -257,7 +257,7 @@ bb: define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_and3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -284,7 +284,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b32: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -309,7 +309,7 @@ bb: define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) { ; GCN-LABEL: uniform_xor3_b64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll index 1feae4dae6a09e..1a9334706cb927 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @gds_global_align_plus_attr(ptr addrspace(1) %out) #0 define amdgpu_kernel void @gds_extern_align(ptr addrspace(1) %out, ptr addrspace(2) %gds.arg) #0 { ; GCN-LABEL: gds_extern_align: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x8 
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x8 ; GCN-NEXT: v_mov_b32_e32 v0, 5 ; GCN-NEXT: s_movk_i32 m0, 0x401 ; GCN-NEXT: s_movk_i32 s1, 0x400 diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index d70d45d44af0fd..944dcda5eba6f2 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -6,12 +6,12 @@ declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 0f951e89d37c8a..81239e841e097e 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -45,7 +45,7 @@ ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs() #0 { +define amdgpu_kernel void @minimal_kernel_inputs() { %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef ret void @@ -74,7 +74,7 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; 
COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0 -define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { +define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { %alloca = alloca i32, addrspace(5) %id = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %id, ptr addrspace(1) undef @@ -107,7 +107,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define amdgpu_kernel void @queue_ptr() #1 { +define amdgpu_kernel void @queue_ptr() { %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 %load = load volatile i8, ptr addrspace(4) %queue.ptr %id = call i32 @llvm.amdgcn.workgroup.id.x() @@ -154,7 +154,7 @@ define amdgpu_kernel void @queue_ptr() #1 { ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 -define amdgpu_kernel void @all_inputs() #2 { +define amdgpu_kernel void @all_inputs() { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -182,19 +182,16 @@ define amdgpu_kernel void @all_inputs() #2 { ret void } -declare i32 @llvm.amdgcn.workgroup.id.x() #3 -declare i32 @llvm.amdgcn.workgroup.id.y() #3 -declare i32 @llvm.amdgcn.workgroup.id.z() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #3 -declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3 -declare i64 @llvm.amdgcn.dispatch.id() #3 - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" 
"amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #1 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #2 = { "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 + +attributes #0 = { nounwind readnone speculatable willreturn } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll index fb402b5ba30d12..be6f8a4375163a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll @@ -4,29 +4,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s7, 
v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN-NEXT: s_bcnt1_i32_b64 s1, s[6:7] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s8, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_add_f32_e32 v4, v5, v2 -; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[2:3] glc +; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[4:5] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v5 @@ -36,7 +36,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GCN-NEXT: ; %bb.3: ; %Flow ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: .LBB0_4: ; %Flow2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: v_readfirstlane_b32 s0, v1 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -52,20 +52,20 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 
s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: .LBB1_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 5c4ded9a231e0d..dce80441601584 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -28,7 +28,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -37,7 
+37,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -47,7 +47,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off @@ -71,7 +71,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc @@ -79,7 +79,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -101,7 +101,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: 
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -123,7 +123,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -151,7 +151,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -180,12 +180,12 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -199,7 +199,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -208,7 +208,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -218,7 +218,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -242,7 +242,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -250,7 +250,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -272,7 +272,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -295,7 +295,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -323,7 +323,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -353,12 +353,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 
{ -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -372,7 +372,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -381,7 +381,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -391,7 +391,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -415,7 +415,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc @@ -423,7 +423,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -445,7 +445,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -468,7 +468,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -501,7 +501,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -536,12 +536,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -555,7 +555,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -564,7 +564,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -574,7 +574,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32: +; 
GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -597,7 +597,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -605,7 +605,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off @@ -613,7 +613,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[0:1] @@ -634,7 +634,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -661,7 +661,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float 
%va ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -689,12 +689,12 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -708,7 +708,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -717,7 +717,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -727,7 +727,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -750,7 +750,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -758,7 +758,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 @@ -766,7 +766,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -789,7 +789,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -816,7 +816,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -845,12 +845,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -864,7 +864,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -873,7 +873,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -883,7 +883,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 @@ -906,7 +906,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -914,7 +914,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 @@ -922,7 +922,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -945,7 +945,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -976,7 +976,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -1009,12 +1009,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1044,7 +1044,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1053,7 +1053,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1079,7 +1079,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1103,7 +1103,7 @@ define float 
@global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1127,7 +1127,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1149,7 +1149,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1172,7 +1172,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1200,7 +1200,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_system_atomic_fadd_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1230,12 +1230,12 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1263,7 +1263,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc0 sc1 @@ -1272,7 +1272,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 @@ -1296,7 +1296,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -1319,7 +1319,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 @@ -1342,7 +1342,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -1363,7 +1363,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1386,7 +1386,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1413,7 +1413,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1442,16 +1442,12 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1460,34 +1456,34 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr 
addrspace(1) %ptr, float ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1495,7 +1491,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -1508,25 +1504,25 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1538,36 +1534,37 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1576,7 +1573,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc 
; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1588,14 +1585,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1605,7 +1602,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1617,12 +1614,13 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +define float 
@global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1636,7 +1634,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -1645,7 +1643,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1655,7 +1653,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1679,7 +1677,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; 
GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc @@ -1687,7 +1685,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1709,7 +1707,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1732,7 +1730,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1760,7 +1758,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1790,12 +1788,12 @@ define float 
@global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 ret float %result } -define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1804,181 +1802,154 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB10_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, v5 -; GFX7-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB10_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v5 -; GFX6-NEXT: v_mov_b32_e32 v1, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB10_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret float %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void } -define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1987,151 +1958,171 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off +; 
GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 
v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, 
s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void 
@global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2140,38 +2131,39 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2179,115 +2171,131 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], 
v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define void 
@global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2296,235 +2304,211 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB13_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB13_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: 
global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB13_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val 
syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: 
.LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc @@ -2532,208 +2516,148 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB14_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB14_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB14_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + ret void } -define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 @@ -2756,51 +2680,23 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: 
.LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -2823,7 +2719,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2850,7 +2746,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -2879,766 +2775,543 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64: +define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off 
offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: 
s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; 
GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB16_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; 
GFX6-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB16_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret void } -define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], 
v[0:1], off offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; 
GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB17_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB17_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void } -define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: 
.LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz 
.LBB18_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], 
s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v11, v1 -; GFX7-NEXT: v_mov_b32_e32 v10, v0 -; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v8 -; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 -; GFX7-NEXT: v_mov_b32_e32 v3, v11 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v1 -; GFX6-NEXT: v_mov_b32_e32 v10, v0 -; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v8 -; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 -; GFX6-NEXT: v_mov_b32_e32 v3, v11 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fadd_noret_f64: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: 
v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1 @@ -3646,408 +3319,323 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX8-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; 
GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB19_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; 
GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB19_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX10-LABEL: 
global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 
exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr 
addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; 
GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB21_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -4055,1977 +3643,1000 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], 
v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX7-LABEL: 
global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v11, v7 -; GFX7-NEXT: v_mov_b32_e32 v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v9, v5 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v7, v9 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v11, v7 -; GFX6-NEXT: v_mov_b32_e32 v10, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v7, v9 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } -; 
-------------------------------------------------------------------- -; half -; -------------------------------------------------------------------- - -define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16: +define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, 
v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_ret_f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX7-NEXT: v_not_b32_e32 v7, v2 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc -; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB22_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 -; GFX6-NEXT: v_not_b32_e32 v7, v2 ; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB22_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst - ret half %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; 
GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 
-; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB23_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; 
GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB23_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret half %result + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0 + ret void } -define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB24_1: ; 
%atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 
0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB24_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; 
GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB24_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret half %result - } + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} -define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, 
v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 -; GFX940-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fadd_noret_f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: 
global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; 
GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB25_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: 
buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB25_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; 
GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB26_1: ; 
%atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 -; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: 
; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_f32_e32 v5, v6, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-NEXT: v_mov_b32_e32 v1, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos: +; 
GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: v_add_f32_e32 v5, v6, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: 
v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6040,91 +4651,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, 
v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6137,453 +4688,266 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 
; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) 
-; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 
exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 - ret half %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: 
.LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6596,65 +4960,33 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -6667,462 +4999,312 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; 
GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 
+; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 
ret void } -define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], 
v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 -; GFX940-NEXT: v_mov_b32_e32 v1, v7 -; GFX940-NEXT: global_load_dword v4, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v4 -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: 
buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v4, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v4 -; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 -; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 -; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX7-LABEL: 
global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 -; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst - ret half %result -} + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} -define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 @@ -7135,66 +5317,26 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 -; 
GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 
v3, v4, v6, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -7208,27 +5350,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7241,61 +5373,39 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr 
addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX908-LABEL: 
global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -7307,27 +5417,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: 
s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -7340,1465 +5440,609 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX7-NEXT: v_not_b32_e32 v6, v2 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_not_b32_e32 v6, v2 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16: +define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 
; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; 
GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, 
v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; 
GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = 
getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %result } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 
v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 
s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: 
v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 
v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: 
v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret void } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, 
v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, 
v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: 
v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: 
v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt 
vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; 
GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret bfloat %result - } + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val 
syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret float %result +} -define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v6, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v4 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; 
GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8813,109 +6057,31 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v6, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v4, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX908-NEXT: v_not_b32_e32 v6, v3 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; 
GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, v0 -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 -; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -8928,663 +6094,286 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr 
addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, 
s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; 
%atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, 
vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX908-LABEL: 
global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, 
v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: 
s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret float %result } -define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 
-; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB37_1 @@ -9592,1302 +6381,800 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f32_e32 v3, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v4, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; 
GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret void } -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: global_load_dwordx2 v[4:5], 
v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX908-NEXT: 
v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 
glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 - ret bfloat %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; 
GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; 
GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: 
.p2align 6 ; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; 
GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 ; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v6, 
0x400000, v2 -; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v11, 
v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 ; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX940-NEXT: v_not_b32_e32 v4, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v5 -; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 
v6, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX10-LABEL: 
global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX90A-NEXT: v_not_b32_e32 v4, v4 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v5 -; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX908-NEXT: global_load_dword v5, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v5 -; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_add_f64 v[4:5], 
v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v5 -; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 
v4, 0xffff, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v7, v4 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mov_b32_e32 v11, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, v0 +; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: 
s_movk_i32 s4, 0xf800 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v7, v4 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v11, v1 +; GFX6-NEXT: v_mov_b32_e32 v10, v0 +; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v0, v8 +; GFX6-NEXT: v_mov_b32_e32 v1, v9 +; GFX6-NEXT: v_mov_b32_e32 v2, v10 +; GFX6-NEXT: v_mov_b32_e32 v3, v11 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: 
v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst - ret bfloat %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v5, v5 ; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10896,90 +7183,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX940-LABEL: 
global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] -; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX940-NEXT: s_mov_b32 s0, 0xffff -; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX940-NEXT: v_not_b32_e32 v5, v5 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: buffer_inv sc1 
; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 
v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10988,38 +7216,23 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_not_b32_e32 v5, v5 ; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB41_1 @@ -11027,1377 +7240,1686 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX90A-NEXT: s_mov_b32 s4, 0xffff -; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX90A-NEXT: v_not_b32_e32 v5, v5 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX90A-NEXT: v_bfe_u32 
v7, v2, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX908-NEXT: s_mov_b32 s4, 0xffff -; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX908-NEXT: 
v_add_f32_e32 v2, v2, v6 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc -; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: buffer_load_dwordx2 
v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 
v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; 
-------------------------------------------------------------------- -; <2 x half> -; -------------------------------------------------------------------- - -define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16: +define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off 
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; 
GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; 
GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, 
v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, 
double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; 
GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: 
v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 ; GFX8-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt 
vmcnt(0) +; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v11, v7 +; GFX7-NEXT: v_mov_b32_e32 v10, v6 +; GFX7-NEXT: v_mov_b32_e32 v9, v5 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v8 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 
v5, v6 ; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v11, v7 +; GFX6-NEXT: v_mov_b32_e32 v10, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, v5 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v6, v8 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v7, v9 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + 
%unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; -------------------------------------------------------------------- +; half +; -------------------------------------------------------------------- + +define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: 
s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 
s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, 
exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 
v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 
s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX7-NEXT: v_not_b32_e32 v7, v2 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; 
GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 +; GFX6-NEXT: v_not_b32_e32 v7, v2 ; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; 
GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result + %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fadd_noret_v2f16: +define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16: 
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; 
GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; 
GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret void + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result } -define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; 
GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off 
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: s_mov_b32 
s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 ; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: 
v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, 
v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: 
buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 ; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst - ret void -} + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result + } -define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; 
GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; 
GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -12411,17 +8933,26 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 ; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -12434,35 +8965,88 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, 
v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; 
GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -12475,246 +9059,10468 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; 
GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; 
GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, 
exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; 
GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 
v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define half 
@global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; 
GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret half %result +} + +define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, 
v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) 
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: 
s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_load_dword v4, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v6 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX10-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v4, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: 
s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v4 +; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7 +; GFX908-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4 +; GFX908-NEXT: 
global_atomic_cmpswap v4, v[0:1], v[6:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX8-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX7-NEXT: v_not_b32_e32 v8, v2 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 
0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 +; GFX6-NEXT: v_not_b32_e32 v8, v2 +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret half %result +} + +define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 
+; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 
0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX940-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .LBB53_1: ; 
%atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: 
v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 +; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX908-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX8-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX7-NEXT: v_not_b32_e32 v6, v2 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: 
v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 +; GFX6-NEXT: v_not_b32_e32 v6, v2 +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: 
v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 
v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; 
GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: 
s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: 
v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: 
buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 
v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; 
GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, 
v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: 
s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 
v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result + } + +define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v6, v3 +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v6, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v6, v3 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: 
s_mov_b32 s4, 0 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX10-NEXT: v_not_b32_e32 v6, v3 +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v6, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX908-NEXT: v_not_b32_e32 v6, v3 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa 
v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 +; GFX8-NEXT: v_not_b32_e32 v6, v3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v4, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v4, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: 
s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, 
v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; 
GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: 
v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: 
s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat 
%val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 
v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, 
v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; 
GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, 
exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 +; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: 
.p2align 6 +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; 
GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] 
+; GFX6-NEXT: s_cbranch_execnz .LBB60_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX10-NEXT: v_lshlrev_b32_e32 
v4, 16, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_not_b32_e32 v4, v4 +; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; 
GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB62_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 +; GFX940-NEXT: v_not_b32_e32 v4, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB62_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v4, v4 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB62_1: ; 
%atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX10-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX10-NEXT: v_not_b32_e32 v4, v4 +; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB62_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: 
v_lshlrev_b32_e64 v4, v3, s4 +; GFX90A-NEXT: v_not_b32_e32 v4, v4 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB62_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX908-NEXT: global_load_dword v5, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX908-NEXT: v_not_b32_e32 v4, 
v4 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v5 +; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB62_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX8-NEXT: flat_load_dword v5, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 +; GFX8-NEXT: v_not_b32_e32 v4, v4 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v5 +; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v7, v4 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v8 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, v3, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB62_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-NEXT: v_not_b32_e32 v5, v5 +; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB63_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 +; GFX940-NEXT: v_not_b32_e32 v5, v5 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB63_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v5, v5 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_not_b32_e32 v5, v5 +; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB63_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX90A-NEXT: v_not_b32_e32 v5, v5 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB63_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: 
v_and_b32_e32 v0, -4, v4 +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX908-NEXT: v_not_b32_e32 v5, v5 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB63_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX8-NEXT: s_mov_b32 s4, 0xffff 
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 +; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: 
v_not_b32_e32 v5, v5 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 +; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v5 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; 
GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 +; GFX6-NEXT: v_mov_b32_e32 v7, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v7 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB63_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; <2 x half> +; -------------------------------------------------------------------- + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB64_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB64_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB64_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt 
vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB64_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB65_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB65_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB65_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; 
GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB65_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], 
off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB66_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB66_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB66_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, 
v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: 
v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB66_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB67_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB67_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 
v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 
16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; 
GFX6-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB67_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB68_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB68_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB68_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, 
exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB68_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB69_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; 
%bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB69_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; 
GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB69_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; 
GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB69_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB70_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off 
offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB70_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 
s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB70_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: 
global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB71_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB71_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB71_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB72_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB72_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 
v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: 
v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB72_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB73_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB73_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; 
GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; 
GFX10-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB74_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB74_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 
v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; 
GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB74_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB75_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: 
; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB75_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr 
addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret void +} + +define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB76_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB76_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; 
GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB76_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, 
v6 +; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB76_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB77_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB77_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, 
v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: 
v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; 
GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB77_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------- +; <2 x bfloat> +; -------------------------------------------------------------------- + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; 
GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB78_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: 
s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB78_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB78_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: 
v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, 
v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB78_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB79_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 
+; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB79_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB79_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 
v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 
0xffff0000, v6 +; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB79_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB80_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; 
GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB80_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: 
v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB80_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB80_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v1 +; GFX6-NEXT: v_mov_b32_e32 v6, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB80_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: 
s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB81_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB81_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB81_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 
v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB81_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: 
s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; 
GFX6-NEXT: s_cbranch_execnz .LBB81_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; 
GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB82_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB82_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB82_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc 
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB82_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] 
+; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB82_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12722,48 +19528,42 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB82_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12771,318 +19571,411 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 -; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB82_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst - ret <2 x half> %result + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX940-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB83_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB83_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB83_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 
16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB83_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz 
.LBB49_1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB83_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 
+; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB83_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: 
s_mov_b32 s6, 0 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: 
v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB83_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; <2 x bfloat> -; -------------------------------------------------------------------- - -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16: +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13107,7 +20000,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; 
GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -13115,21 +20008,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB84_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13149,29 +20042,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB84_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13190,28 +20083,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB84_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13230,67 +20125,68 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB84_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword 
v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 -; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB84_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -13299,7 +20195,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13313,7 +20209,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -13321,21 +20217,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB84_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -13344,7 +20240,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13359,7 +20255,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -13367,253 +20263,251 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB84_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 +; GFX940-NEXT: buffer_wbl2 
sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 
1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB85_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
offset:2044 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB85_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, 
s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; 
%atomicrmw.start +; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz 
.LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB85_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: 
v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB85_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13622,43 +20516,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB85_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 
exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13667,50 +20559,48 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB85_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> %result + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13719,30 +20609,30 @@ define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13767,7 +20657,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], 
v[5:6], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -13775,21 +20665,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB86_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13809,29 +20699,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: 
s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB86_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13850,28 +20740,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13890,162 +20780,156 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB86_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start 
+; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB86_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; 
GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB86_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: 
buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 
0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB86_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16: +define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14059,7 +20943,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 @@ -14068,7 +20952,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14077,7 +20961,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14109,20 +20993,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB87_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14149,12 +21033,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB87_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14163,7 +21047,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14188,12 +21072,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14202,7 +21086,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14227,19 +21111,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB87_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14267,12 +21151,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB87_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14288,7 +21172,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ 
-14310,12 +21194,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB87_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14331,7 +21215,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14354,17 +21238,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB87_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret void } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +define <2 x bfloat> 
@global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14373,320 +21257,327 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: 
.p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: 
v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB88_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 -; 
GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB88_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: 
v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 -; 
GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB88_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) 
; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB88_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB88_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; 
GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_mov_b32_e32 v7, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 
v4, 0xffff0000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB88_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret <2 x bfloat> %result } -define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14695,30 +21586,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14741,7 +21632,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -14750,20 +21641,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; 
GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB89_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14782,7 +21673,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -14790,21 +21681,21 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB89_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: 
global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14822,28 +21713,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB89_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14861,28 +21752,26 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB89_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14910,32 +21799,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr 
addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB89_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -14957,32 +21842,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -15005,50 +21886,49 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB89_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, 
!amdgpu.no.remote.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; 
GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -15073,7 +21953,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -15081,21 +21961,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB90_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 
v6, v3 @@ -15115,29 +21995,29 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB90_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -15156,30 +22036,28 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], 
v[6:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB90_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -15198,68 +22076,67 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB90_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: 
v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB90_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -15268,7 +22145,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -15282,7 +22159,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -15290,21 +22167,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB90_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -15313,7 +22190,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -15328,7 +22205,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 @@ -15336,52 +22213,51 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB90_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } -define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15404,7 +22280,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -15413,20 +22289,20 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB91_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15445,7 +22321,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -15453,21 +22329,21 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB91_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off 
; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15485,30 +22361,28 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB91_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: 
s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15526,28 +22400,26 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB91_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15575,19 +22447,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB91_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -15596,7 +22468,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -15610,7 +22482,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -15618,19 +22490,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB91_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: +; 
GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 @@ -15639,7 +22511,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 @@ -15654,7 +22526,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 -; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 @@ -15662,13 +22534,12 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB91_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 
x bfloat> %val seq_cst + %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void } @@ -15679,162 +22550,162 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX12-LABEL: infer_as_before_atomic: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX12-NEXT: s_cbranch_execz .LBB58_2 +; GFX12-NEXT: s_cbranch_execz .LBB92_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[2:3] scope:SCOPE_DEV -; GFX12-NEXT: .LBB58_2: +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: .LBB92_2: ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm ; ; GFX940-LABEL: infer_as_before_atomic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB58_2 +; GFX940-NEXT: 
s_cbranch_execz .LBB92_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX940-NEXT: .LBB58_2: +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: .LBB92_2: ; GFX940-NEXT: s_endpgm ; ; GFX11-LABEL: infer_as_before_atomic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX11-NEXT: .LBB58_2: +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: .LBB92_2: ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, exec_lo -; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10-NEXT: s_mov_b32 s3, exec_lo +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB58_3 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB92_3 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB58_2 -; GFX10-NEXT: .LBB58_3: +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB92_2 +; GFX10-NEXT: .LBB92_3: ; GFX10-NEXT: s_endpgm ; ; GFX90A-LABEL: infer_as_before_atomic: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB58_2 +; GFX90A-NEXT: s_cbranch_execz .LBB92_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX90A-NEXT: .LBB58_2: +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX90A-NEXT: .LBB92_2: ; GFX90A-NEXT: s_endpgm ; ; GFX908-LABEL: infer_as_before_atomic: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX908-NEXT: s_cbranch_execz .LBB58_2 +; GFX908-NEXT: s_cbranch_execz .LBB92_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[2:3] -; GFX908-NEXT: .LBB58_2: +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; 
GFX908-NEXT: .LBB92_2: ; GFX908-NEXT: s_endpgm ; ; GFX8-LABEL: infer_as_before_atomic: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB58_3 +; GFX8-NEXT: s_cbranch_execz .LBB92_3 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[0:1] -; GFX8-NEXT: s_mov_b64 s[0:1], 0 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s5 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v4, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v4 ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -15843,32 +22714,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_cbranch_execnz .LBB58_2 -; GFX8-NEXT: .LBB58_3: +; GFX8-NEXT: s_cbranch_execnz .LBB92_2 +; GFX8-NEXT: .LBB92_3: ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: 
infer_as_before_atomic: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b64 s[4:5], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_cbranch_execz .LBB58_3 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_cbranch_execz .LBB92_3 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 @@ -15879,32 +22750,32 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB58_2 -; GFX7-NEXT: .LBB58_3: +; GFX7-NEXT: s_cbranch_execnz .LBB92_2 +; GFX7-NEXT: .LBB92_3: ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: infer_as_before_atomic: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, 
s5, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_cbranch_execz .LBB58_3 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX6-NEXT: s_cbranch_execz .LBB92_3 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX6-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: .LBB58_2: ; %atomicrmw.start +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: .LBB92_2: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_add_f32_e32 v0, v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -15916,13 +22787,15 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 { ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB58_2 -; GFX6-NEXT: .LBB58_3: +; GFX6-NEXT: s_cbranch_execnz .LBB92_2 +; GFX6-NEXT: .LBB92_3: ; GFX6-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg - %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 + %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } attributes #0 = { nounwind 
"amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 4f7b6164936f83..56f6f45f0b118a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -28,7 +28,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -53,7 +53,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -63,7 +63,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmax_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -73,7 +73,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -97,7 +97,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -121,7 +121,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -145,7 +145,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -158,7 +158,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; 
GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -171,12 +171,12 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -190,7 +190,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -215,7 +215,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -225,7 +225,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -235,7 +235,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -259,7 +259,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -283,7 +283,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -308,7 +308,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -335,12 +335,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -354,7 +354,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -379,7 +379,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -389,7 +389,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -399,7 +399,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -423,7 +423,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -447,7 +447,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -472,7 +472,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -485,7 +485,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -499,12 +499,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float 
%result } -define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32: +define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -518,7 +518,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -542,7 +542,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -552,7 +552,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -562,7 +562,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -585,7 +585,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -608,7 +608,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -631,7 +631,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -643,7 +643,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -655,12 +655,12 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -674,7 +674,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -698,7 +698,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -708,7 +708,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -718,7 +718,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -741,7 +741,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -764,7 +764,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -789,7 +789,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -801,7 +801,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -814,12 +814,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -833,7 +833,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -857,7 +857,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) 
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -923,7 +923,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -948,7 +948,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -960,7 +960,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -973,12 +973,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 
; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1010,7 +1010,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1035,7 +1035,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1063,7 +1063,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1089,7 +1089,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1115,7 +1115,7 @@ 
define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1139,7 +1139,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1164,7 +1164,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1194,7 +1194,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1226,12 +1226,12 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, 
i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1262,7 +1262,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1286,7 +1286,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1313,7 +1313,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX10-LABEL: 
global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1338,7 +1338,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1363,7 +1363,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1386,7 +1386,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1411,7 +1411,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX7-LABEL: 
global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1440,7 +1440,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1471,16 +1471,12 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1494,7 +1490,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1519,7 +1515,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1529,7 +1525,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1539,7 +1535,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1563,7 +1559,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1587,7 +1583,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; 
GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1611,7 +1607,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1624,7 +1620,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1637,12 +1633,12 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1651,15 +1647,15 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1669,7 +1665,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1681,30 +1677,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: 
global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1713,7 +1709,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1725,10 +1721,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: 
global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1737,7 +1733,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1749,64 +1745,66 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1815,15 +1813,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1833,7 +1831,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap 
v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1845,30 +1843,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1877,7 +1875,7 @@ define float 
@global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1889,10 +1887,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1901,7 +1899,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1913,64 +1911,62 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 
0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1979,154 +1975,162 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 
+; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr 
addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2135,157 +2139,162 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: 
v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; 
GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - 
ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2294,15 +2303,15 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2311,7 +2320,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], 
off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2323,30 +2332,30 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2354,7 +2363,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; 
GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2366,10 +2375,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2377,7 +2386,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2389,11 +2398,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 
s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2414,324 +2421,212 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +define void 
@global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; 
GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: 
s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2739,10 +2634,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX940-NEXT: 
buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2752,62 +2647,30 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2815,10 +2678,8 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap 
v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2829,10 +2690,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2840,7 +2701,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2852,11 +2713,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2877,7 +2738,223 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 
0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off 
offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2886,77 +2963,669 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 
0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; 
GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start 
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: 
v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off 
offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, float 
%val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64: +define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2965,35 +3634,35 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, 
s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3001,7 +3670,7 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: 
buffer_gl0_inv @@ -3009,89 +3678,89 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; 
%atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], 
v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3099,13 +3768,13 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3113,368 +3782,350 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, 
double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], 
v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], 
s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], 
v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], 
off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, 
v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; 
GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 
v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64: +define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3482,7 +4133,7 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3490,34 +4141,34 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc 
; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3526,41 +4177,41 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; 
GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3568,18 +4219,20 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3592,375 +4245,390 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 
0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX940-LABEL: 
global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:2040 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], 
v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr 
double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], 
v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 
v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; 
GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], 
v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16: +define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: 
; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3978,7 +4646,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4000,13 +4668,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4019,7 +4687,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4035,13 +4703,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4055,7 +4723,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4077,13 +4745,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4095,7 +4763,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ 
-4112,13 +4780,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4131,7 +4799,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4146,13 +4814,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4165,7 +4833,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: 
v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4180,13 +4848,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4199,7 +4867,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4215,13 +4883,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX7-LABEL: 
global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4238,7 +4906,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4257,14 +4925,14 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4281,7 +4949,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4300,19 +4968,19 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; 
GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4331,7 +4999,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4353,13 +5021,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4374,7 +5042,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4390,13 +5058,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4411,7 +5079,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4433,13 +5101,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; 
GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4452,7 +5120,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4469,13 +5137,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4489,7 +5157,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 
v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4504,13 +5172,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4524,7 +5192,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4539,13 +5207,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4559,7 +5227,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4575,13 +5243,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4599,7 +5267,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4618,14 +5286,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4643,7 +5311,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4663,7 +5331,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4671,12 +5339,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4695,7 +5363,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4717,13 +5385,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4739,7 +5407,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4755,13 +5423,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4776,7 +5444,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4798,13 +5466,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4817,7 +5485,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4834,13 +5502,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4854,7 +5522,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4869,13 +5537,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4889,7 +5557,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4904,13 +5572,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4924,7 +5592,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4940,13 +5608,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4964,7 +5632,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4983,14 +5651,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5008,7 +5676,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5028,7 +5696,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5036,12 +5704,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16: +define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5059,7 +5727,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5081,12 +5749,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5099,7 +5767,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5115,12 +5783,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5134,7 +5802,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5156,12 +5824,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5173,7 +5841,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5190,12 +5858,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5208,7 +5876,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5223,12 +5891,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5241,7 +5909,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5256,12 +5924,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5274,7 +5942,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5290,12 +5958,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5312,7 +5980,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5331,12 +5999,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5353,7 +6021,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5373,17 +6041,17 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +define void 
@global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5402,7 +6070,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5424,12 +6092,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5444,7 +6112,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5460,12 +6128,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr 
addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5480,7 +6148,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5502,12 +6170,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5520,7 +6188,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; 
GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5537,12 +6205,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5556,7 +6224,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5571,12 +6239,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5590,7 +6258,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5605,12 +6273,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5624,7 +6292,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5640,12 +6308,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5663,7 +6331,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5682,12 +6350,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5705,7 +6373,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, v4, v3 @@ -5725,18 +6393,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5755,7 +6423,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5777,12 +6445,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: 
s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5798,7 +6466,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5814,12 +6482,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5834,7 +6502,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_lshrrev_b32_e32 v2, v4, v3 @@ -5856,12 +6524,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5874,7 +6542,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5891,12 +6559,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5910,7 +6578,7 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5925,12 +6593,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5944,7 +6612,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5959,12 +6627,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5978,7 +6646,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5994,12 +6662,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6017,7 +6685,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6036,12 +6704,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: 
v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6059,7 +6727,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6079,18 +6747,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr 
addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6100,7 +6768,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6119,20 +6787,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6146,19 +6814,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6177,19 +6845,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6205,20 +6873,20 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6231,20 +6899,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; 
GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6257,13 +6925,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6271,7 +6939,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6285,12 +6953,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 
; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6301,7 +6969,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6318,13 +6986,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6335,7 +7003,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6353,19 +7021,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6375,7 +7043,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -6394,19 +7062,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +7088,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6450,18 +7118,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6477,19 +7145,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6502,19 +7170,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: 
s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6527,12 +7195,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6540,7 +7208,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6554,12 +7222,12 @@ define void 
@global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6570,7 +7238,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6587,12 +7255,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6603,7 +7271,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; 
GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6621,18 +7289,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6651,7 +7319,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6673,13 +7341,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6694,7 +7362,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6710,13 +7378,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6731,7 +7399,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6753,13 +7421,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6772,7 +7440,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6789,13 +7457,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6809,7 +7477,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6826,13 +7494,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6846,7 +7514,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6861,13 +7529,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr 
addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6881,7 +7549,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6897,13 +7565,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6921,7 +7589,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: 
v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6940,14 +7608,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6965,7 +7633,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6985,7 +7653,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6993,12 +7661,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr 
addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7017,7 +7685,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7039,12 +7707,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ 
-7059,7 +7727,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7075,12 +7743,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7095,7 +7763,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7117,12 +7785,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7135,7 +7803,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7152,12 +7820,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7171,7 +7839,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7188,12 +7856,12 @@ define void 
@global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7207,7 +7875,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7222,12 +7890,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7241,7 +7909,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7257,12 +7925,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7280,7 +7948,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7299,12 +7967,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: 
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7322,7 +7990,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7342,13 +8010,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7356,8 +8024,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16: +define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7374,7 +8042,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7403,13 +8071,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7423,7 +8091,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7445,13 +8113,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16: +; 
GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7465,7 +8133,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +8162,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7512,7 +8180,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7533,13 +8201,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7553,7 +8221,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7572,13 +8240,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7592,7 +8260,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7611,13 +8279,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7630,7 +8298,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7651,13 +8319,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7674,7 +8342,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7694,14 +8362,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7718,7 +8386,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7738,19 +8406,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7769,7 +8437,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7798,13 +8466,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7820,7 +8488,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7842,13 +8510,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7864,7 +8532,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7893,13 +8561,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7912,7 +8580,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7933,13 +8601,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7954,7 +8622,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7973,13 +8641,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7994,7 +8662,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8013,13 +8681,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8033,7 +8701,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8054,13 +8722,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8078,7 +8746,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8098,14 +8766,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 
exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8123,7 +8791,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8144,7 +8812,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8152,12 +8820,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8176,7 +8844,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8205,13 +8873,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8228,7 +8896,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8250,13 +8918,13 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8272,7 +8940,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8301,13 +8969,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8320,7 +8988,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8341,13 +9009,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8362,7 +9030,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8381,13 +9049,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8402,7 +9070,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8421,13 +9089,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8441,7 +9109,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 
v6, v5 @@ -8462,13 +9130,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8486,7 +9154,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8506,14 +9174,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8531,7 +9199,7 @@ 
define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8552,7 +9220,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8560,12 +9228,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16: +define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8582,7 +9250,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; 
%atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8610,12 +9278,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8629,7 +9297,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8651,12 +9319,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8670,7 +9338,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8698,12 +9366,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8715,7 +9383,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8736,12 +9404,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; 
GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8755,7 +9423,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8774,12 +9442,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8793,7 +9461,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8812,12 +9480,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8830,7 +9498,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8851,12 +9519,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8873,7 +9541,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8893,12 +9561,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8915,7 +9583,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8936,17 +9604,17 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val 
syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8965,7 +9633,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8993,12 +9661,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9014,7 +9682,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9036,12 +9704,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9057,7 +9725,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9085,12 +9753,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9103,7 +9771,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9124,12 +9792,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9144,7 +9812,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD @@ -9163,12 +9831,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9183,7 +9851,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9202,12 +9870,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9221,7 +9889,7 @@ define 
void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9242,12 +9910,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9265,7 +9933,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9285,12 +9953,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9308,7 +9976,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9329,18 +9997,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9359,7 +10027,7 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9387,12 +10055,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9409,7 +10077,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9431,12 +10099,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9452,7 +10120,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9480,12 +10148,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9498,7 +10166,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9519,12 +10187,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9539,7 +10207,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9558,12 +10226,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 
v4, vcc, 0xfffff800, v0 @@ -9578,7 +10246,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9597,12 +10265,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9616,7 +10284,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9637,12 +10305,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9660,7 +10328,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9680,12 +10348,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9703,7 +10371,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, v4, v3 @@ -9724,18 +10392,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9745,7 +10413,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9771,13 +10439,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9785,7 +10453,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9806,20 +10474,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9845,19 +10513,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -9878,13 +10546,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9892,7 +10560,7 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9911,13 +10579,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9925,7 +10593,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9944,13 +10612,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9958,7 +10626,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9978,12 +10646,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9994,7 +10662,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10012,13 +10680,13 @@ define 
bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10029,7 +10697,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10048,19 +10716,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10070,7 +10738,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10095,12 +10763,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10108,7 +10776,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; 
%atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10129,19 +10797,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10166,18 +10834,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], 
off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10198,12 +10866,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10211,7 +10879,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10230,12 +10898,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10243,7 +10911,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10262,12 +10930,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10275,7 +10943,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10295,12 +10963,12 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10311,7 +10979,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10329,12 +10997,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10345,7 +11013,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10364,18 +11032,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10394,7 +11062,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10423,13 +11091,13 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10445,7 +11113,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10467,13 +11135,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10489,7 +11157,7 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10518,13 +11186,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10537,7 +11205,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10558,13 +11226,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10579,7 +11247,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10600,13 +11268,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10621,7 +11289,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_mov_b32_e32 v6, v5 @@ -10640,13 +11308,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10660,7 +11328,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -10681,13 +11349,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10705,7 +11373,7 @@ define bfloat 
@global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10725,14 +11393,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10750,7 +11418,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10771,7 +11439,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: 
v_lshrrev_b32_e32 v0, v6, v4 @@ -10779,12 +11447,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10803,7 +11471,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10831,12 +11499,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX940-LABEL: 
global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10852,7 +11520,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10874,12 +11542,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10895,7 +11563,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10923,12 +11591,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, 
vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10941,7 +11609,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10962,12 +11630,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10982,7 +11650,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: 
v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11003,12 +11671,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11023,7 +11691,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11042,12 +11710,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11061,7 +11729,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11082,12 +11750,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11105,7 +11773,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11125,12 +11793,12 @@ define void 
@global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11148,7 +11816,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11169,13 +11837,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11183,8 +11851,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x 
half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11194,7 +11862,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11210,19 +11878,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11236,19 +11904,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, 
<2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11264,19 +11932,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11290,19 +11958,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr 
addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11314,19 +11982,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11338,20 +12006,20 @@ define <2 
x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11366,13 +12034,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11389,7 +12057,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: 
.LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11414,14 +12082,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11438,7 +12106,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11464,19 +12132,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw 
fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11486,7 +12154,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11502,19 +12170,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; 
GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11528,19 +12196,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11556,19 +12224,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; 
GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11582,19 +12250,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11606,19 +12274,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; 
%bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11630,13 +12298,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11645,7 +12313,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11660,12 +12328,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11682,7 +12350,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11707,14 +12375,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11731,7 +12399,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11757,7 +12425,7 @@ 
define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11765,12 +12433,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11780,7 +12448,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11796,19 +12464,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11822,19 +12490,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11850,19 +12518,19 
@@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11876,19 +12544,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11900,19 +12568,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11924,13 +12592,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11939,7 +12607,7 @@ define <2 x half> 
@global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11954,12 +12622,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11980,7 +12648,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12005,12 +12673,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12031,7 +12699,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12057,18 +12725,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16: +define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12078,7 +12746,7 @@ define void 
@global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12094,18 +12762,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12119,18 +12787,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12146,18 +12814,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,18 +12839,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12194,18 +12862,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12217,19 +12885,19 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12244,12 +12912,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12266,7 +12934,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12291,12 +12959,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12313,7 +12981,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12339,17 +13007,17 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12359,7 +13027,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: 
global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12375,18 +13043,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12400,18 +13068,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12427,18 +13095,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12452,18 +13120,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12475,18 +13143,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12498,12 +13166,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12512,7 +13180,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12527,12 +13195,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13217,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12574,12 +13242,12 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12596,7 +13264,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12622,18 +13290,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12643,7 +13311,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12659,18 +13327,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12684,18 +13352,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13379,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12736,18 +13404,18 @@ define void 
@global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12759,18 +13427,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_pk_max_f16 v2, v3, v3 @@ -12782,12 +13450,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12796,7 +13464,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12811,12 +13479,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 
0xf800 @@ -12837,7 +13505,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12862,12 +13530,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12888,7 +13556,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12914,18 +13582,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12935,7 +13603,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12951,19 +13619,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12977,19 +13645,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13005,19 +13673,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX10-LABEL: 
global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13031,19 +13699,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13057,19 +13725,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 
; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13081,13 +13749,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13096,7 +13764,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13111,12 +13779,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13801,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13158,14 +13826,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13182,7 +13850,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; 
GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13208,7 +13876,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13216,12 +13884,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13231,7 +13899,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13247,18 +13915,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13272,18 +13940,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13967,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13324,18 +13992,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; 
GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13349,18 +14017,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13372,12 +14040,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13386,7 +14054,7 @@ define void 
@global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13401,12 +14069,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13423,7 +14091,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13448,12 +14116,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13470,7 +14138,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13496,13 +14164,13 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13510,8 +14178,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13522,7 +14190,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13555,13 +14223,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13570,7 +14238,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13597,13 +14265,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, 
exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13612,7 +14280,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13645,21 +14313,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz .LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13686,13 +14354,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13701,7 +14369,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13726,13 +14394,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13741,7 +14409,7 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13766,20 +14434,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13807,13 +14475,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX7-LABEL: 
global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13829,7 +14497,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13851,14 +14519,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13874,7 +14542,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13897,19 +14565,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; 
GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13920,7 +14588,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13953,13 +14621,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13968,7 +14636,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13995,13 +14663,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14010,7 +14678,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14043,21 +14711,21 @@ 
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14084,13 +14752,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14099,7 +14767,7 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14124,13 +14792,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14139,7 +14807,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14164,13 +14832,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14179,7 +14847,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14207,12 +14875,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14228,7 +14896,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14250,14 +14918,14 @@ 
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14273,7 +14941,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14296,7 +14964,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14304,12 +14972,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) 
%gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14320,7 +14988,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14353,13 +15021,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14368,7 +15036,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff 
; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14395,13 +15063,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14410,7 +15078,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14443,21 +15111,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14484,13 +15152,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14499,7 +15167,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14524,13 +15192,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14539,7 +15207,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14564,13 +15232,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14579,7 +15247,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14607,12 +15275,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14632,7 +15300,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14654,12 +15322,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: 
global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14679,7 +15347,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14702,18 +15370,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16: +define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14724,7 +15392,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14756,12 +15424,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14770,7 +15438,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14797,12 +15465,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX11-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14811,7 +15479,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14843,20 +15511,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14883,12 +15551,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz 
.LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14897,7 +15565,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14922,12 +15590,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14936,7 +15604,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ 
-14961,19 +15629,19 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15001,12 +15669,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15022,7 +15690,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15044,12 +15712,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15065,7 +15733,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15088,17 +15756,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr 
addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15109,7 +15777,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15141,12 +15809,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15155,7 +15823,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15182,12 +15850,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15196,7 +15864,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15228,20 +15896,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15268,12 +15936,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15282,7 +15950,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15307,12 +15975,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15321,7 +15989,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15346,12 +16014,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15360,7 +16028,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15388,12 +16056,12 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15409,7 +16077,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15431,12 +16099,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15452,7 +16120,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15475,18 +16143,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15497,7 +16165,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15529,12 +16197,12 @@ define 
void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15543,7 +16211,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15570,12 +16238,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15584,7 +16252,7 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15616,20 +16284,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15656,12 +16324,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: 
global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15670,7 +16338,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15695,12 +16363,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15709,7 +16377,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15734,12 +16402,12 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15748,7 +16416,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15776,12 +16444,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15801,7 +16469,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 
0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15823,12 +16491,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15848,7 +16516,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15871,18 +16539,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x 
bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15893,7 +16561,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15926,13 +16594,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15941,7 +16609,7 @@ define <2 x bfloat> 
@global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15968,13 +16636,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15983,7 +16651,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16016,21 +16684,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 
0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16057,13 +16725,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16072,7 +16740,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16099,13 +16767,13 @@ 
define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16114,7 +16782,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16139,13 +16807,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16154,7 +16822,7 @@ define 
<2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16182,12 +16850,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16203,7 +16871,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16225,14 +16893,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: 
v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16248,7 +16916,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16271,7 +16939,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16279,12 +16947,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; 
GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16295,7 +16963,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16327,12 +16995,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16341,7 +17009,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16368,12 +17036,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16382,7 +17050,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16414,20 +17082,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz .LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; 
%atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16454,12 +17122,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16468,7 +17136,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16495,12 +17163,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16509,7 +17177,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16534,12 +17202,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16548,7 +17216,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16576,12 +17244,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16597,7 +17265,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16619,12 +17287,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16640,7 +17308,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16663,15 +17331,17 @@ define void 
@global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 591e01b11bd245..534506fb92d131 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -13,8 +13,8 @@ ; float ; -------------------------------------------------------------------- -define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -28,7 +28,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; 
%bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -53,7 +53,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -63,7 +63,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -73,7 +73,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -97,7 +97,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -121,7 +121,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_ret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -145,7 +145,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -158,7 +158,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -171,12 +171,12 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -190,7 +190,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -215,7 +215,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -225,7 +225,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -235,7 +235,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -259,7 +259,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -283,7 +283,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -308,7 +308,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -321,7 +321,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -335,12 +335,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") 
seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -354,7 +354,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -379,7 +379,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -389,7 +389,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -399,7 +399,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -423,7 +423,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -447,7 +447,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -472,7 +472,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -485,7 +485,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -499,12 +499,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32: +define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -518,7 +518,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -542,7 +542,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 @@ -552,7 +552,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -562,7 +562,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -585,7 +585,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -608,7 +608,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -631,7 +631,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX7-LABEL: 
global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -643,7 +643,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -655,12 +655,12 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -674,7 +674,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -698,7 +698,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -708,7 +708,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -718,7 +718,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -741,7 +741,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -764,7 +764,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] 
; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -789,7 +789,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -801,7 +801,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -814,12 +814,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 
; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -833,7 +833,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -857,7 +857,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -867,7 +867,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -877,7 +877,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -900,7 +900,7 @@ define void 
@global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -923,7 +923,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -948,7 +948,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -960,7 +960,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -973,12 +973,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr 
addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1010,7 +1010,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1035,7 +1035,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1063,7 +1063,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX10-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1089,7 +1089,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1115,7 +1115,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1139,7 +1139,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -1164,7 +1164,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1194,7 +1194,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1226,12 +1226,12 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1262,7 +1262,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1286,7 +1286,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr 
addrspace(1) ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -1313,7 +1313,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1338,7 +1338,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1363,7 +1363,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -1386,7 +1386,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -1411,7 +1411,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1440,7 +1440,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1471,16 +1471,12 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -; -------------------------------------------------------------------- -; float with ftz/daz -; -------------------------------------------------------------------- - -define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +define float 
@global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1494,7 +1490,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -1519,7 +1515,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1529,7 +1525,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1539,7 +1535,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -1563,7 +1559,7 @@ define float 
@global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1587,7 +1583,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] @@ -1611,7 +1607,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -1624,7 +1620,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -1637,12 +1633,12 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, 
!amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1651,15 +1647,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1669,7 +1665,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1681,30 +1677,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1713,7 +1709,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: 
v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1725,10 +1721,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start @@ -1737,7 +1733,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1749,64 +1745,66 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword 
v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: 
s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret float %result } -define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; -------------------------------------------------------------------- +; float with ftz/daz +; -------------------------------------------------------------------- + +define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1815,15 +1813,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1833,7 +1831,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1845,30 +1843,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1877,7 +1875,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1889,10 +1887,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start @@ -1901,7 +1899,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 ; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 @@ -1913,64 +1911,62 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, 
-1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } -define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -1979,154 +1975,162 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 
v[0:1], v2, off scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB11_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2135,157 +2139,162 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: 
global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB12_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: 
global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, 
v2 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 -; 
GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst - ret void + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result } -define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -2294,15 +2303,15 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; 
GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2311,7 +2320,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2323,30 +2332,30 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2354,7 +2363,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2366,10 +2375,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start @@ -2377,7 +2386,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2389,11 +2398,9 @@ 
define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2414,324 +2421,212 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_cbranch_execnz .LBB14_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
global_atomic_fmin v[0:1], v2, off offset:2044 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB14_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v4, v3 -; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v4, 
1.0, v2 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 -; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB14_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret float %result + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void 
@global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX940-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2739,10 +2634,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -2752,62 +2647,30 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], 
off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:-2048 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2815,10 +2678,8 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2829,10 +2690,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start @@ -2840,7 +2701,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 @@ -2852,11 +2713,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 @@ -2877,7 +2738,223 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_movk_i32 s4, 0xf800 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 +; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xf800 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 +; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 
addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4 +; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: 
.LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -2886,77 +2963,669 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 -; GFX7-NEXT: v_mov_b32_e32 v5, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX6-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB16_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v3, 
v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 + %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +; -------------------------------------------------------------------- +; double +; -------------------------------------------------------------------- + +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 
:: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, 
double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: global_wb scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v5 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB15_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2 -; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; 
GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB15_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst - ret void + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %result } -; -------------------------------------------------------------------- -; double -; -------------------------------------------------------------------- - -define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64: +define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; 
GFX12-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -2965,35 +3634,35 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off 
offset:-2048 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3001,7 +3670,7 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3009,89 +3678,89 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 
; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: v_mov_b32_e32 v6, v4 ; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] ; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[4:5], 
v[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: s_cbranch_execnz .LBB20_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -3099,13 +3768,13 @@ define double 
@global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -3113,368 +3782,350 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } -define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; 
GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: 
v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, 
v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define double 
@global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: 
global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v3 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 
0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 +; GFX8-NEXT: s_cbranch_execnz .LBB22_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: 
buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret double %result + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void } -define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64: +define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3482,7 +4133,7 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] @@ -3490,34 +4141,34 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -3526,41 +4177,41 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX90A-LABEL: 
global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -3568,18 +4219,20 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3592,375 +4245,390 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 +; GFX8-NEXT: s_cbranch_execnz .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_movk_i32 s4, 0xf800 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_movk_i32 s4, 0xf800 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 
addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst + %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 + %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:2040 +; 
GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: s_cbranch_execnz .LBB24_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX8-LABEL: 
global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 +; GFX8-NEXT: s_cbranch_execnz .LBB24_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], 
v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 + ret double %result } -define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] +; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX12-NEXT: v_max_num_f64_e32 
v[2:3], v[2:3], v[2:3] ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] ; GFX12-NEXT: global_wb scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], 
off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc +; GFX908-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 +; GFX8-NEXT: s_cbranch_execnz .LBB25_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xf800 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s5, -1 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xf800 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s5, -1 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256 - %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst - ret void + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 + ret double %result } ; -------------------------------------------------------------------- ; half ; -------------------------------------------------------------------- -define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16: +define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -3978,7 +4646,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4000,13 +4668,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX940-LABEL: 
global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -4019,7 +4687,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4035,13 +4703,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -4055,7 +4723,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4077,13 +4745,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -4095,7 +4763,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4112,13 +4780,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -4131,7 +4799,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4146,13 +4814,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -4165,7 +4833,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4180,13 +4848,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -4199,7 +4867,7 @@ define half 
@global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4215,13 +4883,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 +; GFX8-NEXT: s_cbranch_execnz .LBB26_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -4238,7 +4906,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 -; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4257,14 +4925,14 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB22_1 +; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -4281,7 +4949,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 -; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -4300,19 +4968,19 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB22_1 +; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4331,7 +4999,7 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4353,13 +5021,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -4374,7 +5042,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4390,13 +5058,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4411,7 +5079,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4433,13 +5101,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -4452,7 +5120,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, 
v5 @@ -4469,13 +5137,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4489,7 +5157,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4504,13 +5172,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -4524,7 +5192,7 @@ 
define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4539,13 +5207,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -4559,7 +5227,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4575,13 +5243,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 +; GFX8-NEXT: s_cbranch_execnz .LBB27_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 
v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4599,7 +5267,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4618,14 +5286,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB23_1 +; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -4643,7 +5311,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4663,7 +5331,7 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB23_1 +; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -4671,12 +5339,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -4695,7 +5363,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -4717,13 +5385,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -4739,7 +5407,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -4755,13 +5423,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4776,7 +5444,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; 
%atomicrmw.start +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -4798,13 +5466,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -4817,7 +5485,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -4834,13 +5502,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4854,7 +5522,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -4869,13 +5537,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4889,7 +5557,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -4904,13 +5572,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -4924,7 +5592,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -4940,13 +5608,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 +; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -4964,7 +5632,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; 
GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -4983,14 +5651,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 +; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -5008,7 +5676,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -5028,7 +5696,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB24_1 +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -5036,12 +5704,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16: +define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5059,7 +5727,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5081,12 +5749,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -5099,7 +5767,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: 
v_not_b32_e32 v6, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5115,12 +5783,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v3, v0 @@ -5134,7 +5802,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5156,12 +5824,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX10-LABEL: 
global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -5173,7 +5841,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5190,12 +5858,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -5208,7 +5876,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: v_not_b32_e32 v6, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 @@ -5223,12 +5891,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; 
GFX90A-NEXT: s_cbranch_execnz .LBB29_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -5241,7 +5909,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: v_not_b32_e32 v6, v3 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5256,12 +5924,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -5274,7 +5942,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -5290,12 +5958,12 @@ define void 
@global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 +; GFX8-NEXT: s_cbranch_execnz .LBB29_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -5312,7 +5980,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5331,12 +5999,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB25_1 +; GFX7-NEXT: s_cbranch_execnz .LBB29_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -5353,7 +6021,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 @@ -5373,17 +6041,17 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB25_1 +; GFX6-NEXT: s_cbranch_execnz .LBB29_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5402,7 +6070,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5424,12 +6092,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB26_1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -5444,7 +6112,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5460,12 +6128,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5480,7 +6148,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5502,12 +6170,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -5520,7 +6188,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5537,12 +6205,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5556,7 +6224,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5571,12 +6239,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -5590,7 +6258,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5605,12 +6273,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -5624,7 +6292,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5640,12 +6308,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB26_1 +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5663,7 +6331,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5682,12 +6350,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB26_1 +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -5705,7 +6373,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5725,18 +6393,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB26_1 +; GFX6-NEXT: s_cbranch_execnz .LBB30_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -5755,7 +6423,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5777,12 +6445,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -5798,7 +6466,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5814,12 +6482,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 
-; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5834,7 +6502,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5856,12 +6524,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -5874,7 +6542,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start ; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5891,12 +6559,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5910,7 +6578,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5925,12 +6593,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 
0xfffff800, v0 @@ -5944,7 +6612,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5959,12 +6627,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -5978,7 +6646,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -5994,12 +6662,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB27_1 +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6017,7 +6685,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6036,12 +6704,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB27_1 +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -6059,7 +6727,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -6079,18 +6747,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB27_1 +; GFX6-NEXT: s_cbranch_execnz .LBB31_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6100,7 +6768,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -6119,20 +6787,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -6146,19 +6814,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -6177,19 +6845,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB28_1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -6205,20 +6873,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -6231,20 +6899,20 @@ define half 
@global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -6257,13 +6925,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6271,7 +6939,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: flat_load_dword v0, v[3:4] 
; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -6285,12 +6953,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB28_1 +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6301,7 +6969,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6318,13 +6986,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB28_1 +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4: +; GFX6-LABEL: 
global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6335,7 +7003,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6353,19 +7021,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6375,7 +7043,7 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 @@ -6394,19 +7062,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6420,18 +7088,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6450,18 +7118,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6477,19 +7145,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, 
s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6502,19 +7170,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 ; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 -; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6527,12 +7195,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -6540,7 +7208,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 @@ -6554,12 +7222,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB29_1 +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -6570,7 +7238,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6587,12 +7255,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB29_1 +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -6603,7 +7271,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 @@ -6621,18 +7289,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB29_1 +; GFX6-NEXT: s_cbranch_execnz .LBB33_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, 
half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -6651,7 +7319,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -6673,13 +7341,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -6694,7 +7362,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_not_b32_e32 v4, v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -6710,13 +7378,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -6731,7 +7399,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -6753,13 +7421,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, 
vcc_lo, 0x7fe, v0 @@ -6772,7 +7440,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -6789,13 +7457,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6809,7 +7477,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_not_b32_e32 v4, v4 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -6826,13 +7494,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -6846,7 +7514,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_not_b32_e32 v4, v4 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -6861,13 +7529,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -6881,7 +7549,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -6897,13 +7565,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6921,7 +7589,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6940,14 +7608,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -6965,7 
+7633,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 -; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -6985,7 +7653,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB30_1 +; GFX6-NEXT: s_cbranch_execnz .LBB34_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -6993,12 +7661,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } -define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7017,7 +7685,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; 
GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7039,12 +7707,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7059,7 +7727,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_not_b32_e32 v5, v5 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7075,12 +7743,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7095,7 +7763,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7117,12 +7785,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -7135,7 +7803,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7152,12 +7820,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: 
s_cbranch_execnz .LBB35_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7171,7 +7839,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_not_b32_e32 v5, v5 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7188,12 +7856,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -7207,7 +7875,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_not_b32_e32 v5, v5 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7222,12 +7890,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -7241,7 +7909,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7257,12 +7925,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7280,7 +7948,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7299,12 +7967,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -7322,7 +7990,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 -; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -7342,13 +8010,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB31_1 +; GFX6-NEXT: s_cbranch_execnz .LBB35_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) 
%gep, half %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7356,8 +8024,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) ; bfloat ; -------------------------------------------------------------------- -define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16: +define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7374,7 +8042,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7403,13 +8071,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -7423,7 +8091,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr 
addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7445,13 +8113,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: s_cbranch_execnz .LBB36_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -7465,7 +8133,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7494,13 +8162,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -7512,7 +8180,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7533,13 +8201,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -7553,7 +8221,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7572,13 +8240,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, 
v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -7592,7 +8260,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -7611,13 +8279,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: s_cbranch_execnz .LBB36_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -7630,7 +8298,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; 
GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -7651,13 +8319,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -7674,7 +8342,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7694,14 +8362,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -7718,7 +8386,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4 @@ -7738,19 +8406,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB32_1 +; GFX6-NEXT: s_cbranch_execnz .LBB36_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -7769,7 +8437,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; 
%atomicrmw.start +; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -7798,13 +8466,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: s_cbranch_execnz .LBB37_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -7820,7 +8488,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -7842,13 +8510,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: s_cbranch_execnz .LBB37_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7864,7 +8532,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -7893,13 +8561,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -7912,7 +8580,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -7933,13 +8601,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, 
vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7954,7 +8622,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -7973,13 +8641,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -7994,7 +8662,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: 
s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8013,13 +8681,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: s_cbranch_execnz .LBB37_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -8033,7 +8701,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8054,13 +8722,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8078,7 +8746,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8098,14 +8766,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -8123,7 +8791,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8144,7 +8812,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: 
v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB33_1 +; GFX6-NEXT: s_cbranch_execnz .LBB37_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8152,12 +8820,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8176,7 +8844,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -8205,13 +8873,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: s_cbranch_execnz .LBB38_1 ; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -8228,7 +8896,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -8250,13 +8918,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: s_cbranch_execnz .LBB38_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8272,7 +8940,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -8301,13 +8969,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: s_cbranch_execnz .LBB38_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 @@ -8320,7 +8988,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -8341,13 +9009,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: s_cbranch_execnz .LBB38_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8362,7 +9030,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -8381,13 +9049,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8402,7 +9070,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -8421,13 +9089,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 +; GFX908-NEXT: 
s_cbranch_execnz .LBB38_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -8441,7 +9109,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v5 @@ -8462,13 +9130,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8486,7 +9154,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8506,14 +9174,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -8531,7 +9199,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -8552,7 +9220,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB34_1 +; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -8560,12 +9228,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat 
%val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16: +define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8582,7 +9250,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8610,12 +9278,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -8629,7 +9297,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB35_1: 
; %atomicrmw.start +; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8651,12 +9319,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v5, v4 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: s_cbranch_execnz .LBB39_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 @@ -8670,7 +9338,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v6, v3 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8698,12 +9366,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, v0 @@ -8715,7 +9383,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff ; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8736,12 +9404,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -8755,7 +9423,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8774,12 +9442,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v5, v4 ; GFX90A-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v3, v0 @@ -8793,7 +9461,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8812,12 +9480,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: s_cbranch_execnz .LBB39_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v0 @@ -8830,7 +9498,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: v_not_b32_e32 v6, v3 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -8851,12 +9519,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, v0 @@ -8873,7 +9541,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: v_not_b32_e32 v6, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8893,12 +9561,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v4, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v3, v0 @@ -8915,7 +9583,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; 
GFX6-NEXT: v_not_b32_e32 v6, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 @@ -8936,17 +9604,17 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v4, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB35_1 +; GFX6-NEXT: s_cbranch_execnz .LBB39_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -8965,7 +9633,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -8993,12 +9661,12 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: s_cbranch_execnz .LBB40_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -9014,7 +9682,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9036,12 +9704,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB36_1 +; GFX940-NEXT: s_cbranch_execnz .LBB40_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9057,7 +9725,7 @@ define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9085,12 +9753,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: s_cbranch_execnz .LBB40_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -9103,7 +9771,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9124,12 +9792,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: s_cbranch_execnz .LBB40_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 
exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9144,7 +9812,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9163,12 +9831,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -9183,7 +9851,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: 
v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9202,12 +9870,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB36_1 +; GFX908-NEXT: s_cbranch_execnz .LBB40_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -9221,7 +9889,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9242,12 +9910,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 
0x7fe, v0 @@ -9265,7 +9933,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9285,12 +9953,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -9308,7 +9976,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9329,18 +9997,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB36_1 +; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: 
s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9359,7 +10027,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9387,12 +10055,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-NEXT: s_cbranch_execnz .LBB41_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_movk_i32 s0, 0xf800 @@ -9409,7 +10077,7 @@ 
define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9431,12 +10099,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB37_1 +; GFX940-NEXT: s_cbranch_execnz .LBB41_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9452,7 +10120,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9480,12 +10148,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: s_cbranch_execnz .LBB41_1 ; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 @@ -9498,7 +10166,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9519,12 +10187,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: s_cbranch_execnz .LBB41_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9539,7 +10207,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9558,12 +10226,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9578,7 +10246,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9597,12 +10265,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB37_1 +; GFX908-NEXT: s_cbranch_execnz .LBB41_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: 
; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 @@ -9616,7 +10284,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -9637,12 +10305,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9660,7 +10328,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9680,12 +10348,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz 
.LBB37_1 +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 @@ -9703,7 +10371,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -9724,18 +10392,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB37_1 +; GFX6-NEXT: s_cbranch_execnz .LBB41_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -9745,7 +10413,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -9771,13 +10439,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-NEXT: s_cbranch_execnz .LBB42_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9785,7 +10453,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -9806,20 +10474,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB38_1 +; GFX940-NEXT: s_cbranch_execnz .LBB42_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -9845,19 +10513,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-NEXT: s_cbranch_execnz .LBB42_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 @@ -9878,13 +10546,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB38_1 +; GFX10-NEXT: s_cbranch_execnz .LBB42_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -9892,7 +10560,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -9911,13 +10579,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: 
global_load_dword v3, v[0:1], off offset:2046 @@ -9925,7 +10593,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -9944,13 +10612,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB38_1 +; GFX908-NEXT: s_cbranch_execnz .LBB42_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -9958,7 +10626,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -9978,12 +10646,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 ; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -9994,7 +10662,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10012,13 +10680,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4: +; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10029,7 +10697,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ 
-10048,19 +10716,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB38_1 +; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10070,7 +10738,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10095,12 +10763,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: s_cbranch_execnz .LBB43_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10108,7 +10776,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10129,19 +10797,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB39_1 +; GFX940-NEXT: s_cbranch_execnz .LBB43_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10166,18 +10834,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: s_cbranch_execnz .LBB43_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10198,12 +10866,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: s_cbranch_execnz .LBB43_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10211,7 +10879,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff ; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10230,12 +10898,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046 @@ -10243,7 +10911,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff ; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 -; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10262,12 +10930,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB39_1 +; GFX908-NEXT: s_cbranch_execnz .LBB43_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 @@ -10275,7 +10943,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10295,12 +10963,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -10311,7 +10979,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10329,12 +10997,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX7-NEXT: v_mov_b32_e32 v3, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -10345,7 +11013,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -10364,18 +11032,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB39_1 +; GFX6-NEXT: s_cbranch_execnz .LBB43_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10394,7 +11062,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v5 @@ -10423,13 +11091,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-NEXT: s_cbranch_execnz .LBB44_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10445,7 +11113,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v5 @@ -10467,13 +11135,13 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB40_1 +; GFX940-NEXT: s_cbranch_execnz .LBB44_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10489,7 +11157,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v4, v4 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v5 @@ -10518,13 +11186,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-NEXT: s_cbranch_execnz .LBB44_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 @@ -10537,7 +11205,7 @@ define bfloat 
@global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v5 @@ -10558,13 +11226,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB40_1 +; GFX10-NEXT: s_cbranch_execnz .LBB44_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10579,7 +11247,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -10600,13 +11268,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 @@ -10621,7 +11289,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v5 @@ -10640,13 +11308,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB40_1 +; GFX908-NEXT: s_cbranch_execnz .LBB44_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 @@ -10660,7 +11328,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v6, v5 @@ -10681,13 +11349,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10705,7 +11373,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v7, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10725,14 +11393,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -10750,7 
+11418,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v7, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3 @@ -10771,7 +11439,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB40_1 +; GFX6-NEXT: s_cbranch_execnz .LBB44_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 @@ -10779,12 +11447,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } -define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -10803,7 +11471,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: 
v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10831,12 +11499,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-NEXT: s_cbranch_execnz .LBB45_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe @@ -10852,7 +11520,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff -; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10874,12 +11542,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB41_1 +; GFX940-NEXT: s_cbranch_execnz .LBB45_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX11-LABEL: 
global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10895,7 +11563,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_not_b32_e32 v5, v5 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -10923,12 +11591,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-NEXT: s_cbranch_execnz .LBB45_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 @@ -10941,7 +11609,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff ; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -10962,12 +11630,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB41_1 +; GFX10-NEXT: s_cbranch_execnz .LBB45_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -10982,7 +11650,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX90A-NEXT: s_movk_i32 s6, 0x7fff -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11003,12 +11671,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 @@ -11023,7 +11691,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; 
GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX908-NEXT: s_movk_i32 s6, 0x7fff -; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11042,12 +11710,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB41_1 +; GFX908-NEXT: s_cbranch_execnz .LBB45_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 @@ -11061,7 +11729,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -11082,12 +11750,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; -; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11105,7 +11773,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_not_b32_e32 v5, v5 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11125,12 +11793,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v7 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 @@ -11148,7 +11816,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_not_b32_e32 v5, v5 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3 @@ -11169,13 +11837,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; 
GFX6-NEXT: v_mov_b32_e32 v3, v7 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB41_1 +; GFX6-NEXT: s_cbranch_execnz .LBB45_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11183,8 +11851,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1 ; <2 x half> ; -------------------------------------------------------------------- -define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11194,7 +11862,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11210,19 +11878,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-NEXT: s_cbranch_execnz .LBB46_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11236,19 +11904,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 +; GFX940-NEXT: s_cbranch_execnz .LBB46_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11264,19 +11932,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-NEXT: s_cbranch_execnz .LBB46_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11290,19 +11958,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB42_1 +; GFX10-NEXT: s_cbranch_execnz .LBB46_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11314,19 +11982,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 +; GFX90A-NEXT: 
s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11338,20 +12006,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB42_1 +; GFX908-NEXT: s_cbranch_execnz .LBB46_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -11366,13 +12034,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11389,7 +12057,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11414,14 +12082,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11438,7 +12106,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11464,19 +12132,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB42_1 +; GFX6-NEXT: s_cbranch_execnz .LBB46_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11486,7 +12154,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11502,19 +12170,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-NEXT: s_cbranch_execnz .LBB47_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11528,19 +12196,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 +; GFX940-NEXT: s_cbranch_execnz .LBB47_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11556,19 +12224,19 
@@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-NEXT: s_cbranch_execnz .LBB47_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11582,19 +12250,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB43_1 +; GFX10-NEXT: s_cbranch_execnz .LBB47_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11606,19 +12274,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11630,13 +12298,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB43_1 +; GFX908-NEXT: s_cbranch_execnz .LBB47_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -11645,7 +12313,7 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11660,12 +12328,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -11682,7 +12350,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11707,14 +12375,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -11731,7 +12399,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -11757,7 +12425,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB43_1 +; GFX6-NEXT: s_cbranch_execnz .LBB47_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -11765,12 +12433,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { 
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -11780,7 +12448,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -11796,19 +12464,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-NEXT: s_cbranch_execnz .LBB48_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -11822,19 +12490,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 +; GFX940-NEXT: 
s_cbranch_execnz .LBB48_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -11850,19 +12518,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-NEXT: s_cbranch_execnz .LBB48_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -11876,19 +12544,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB44_1 +; GFX10-NEXT: s_cbranch_execnz .LBB48_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -11900,19 +12568,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -11924,13 +12592,13 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB44_1 +; GFX908-NEXT: s_cbranch_execnz .LBB48_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -11939,7 +12607,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -11954,12 +12622,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -11980,7 +12648,7 @@ define <2 x half> 
@global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12005,12 +12673,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12031,7 +12699,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -12057,18 +12725,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB44_1 +; GFX6-NEXT: s_cbranch_execnz .LBB48_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16: +define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12078,7 +12746,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12094,18 +12762,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-NEXT: s_cbranch_execnz .LBB49_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: 
.LBB45_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12119,18 +12787,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12146,18 +12814,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-NEXT: s_cbranch_execnz .LBB49_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start +; 
GFX10-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12171,18 +12839,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB45_1 +; GFX10-NEXT: s_cbranch_execnz .LBB49_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12194,18 +12862,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB49_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12217,19 +12885,19 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB45_1 +; GFX908-NEXT: s_cbranch_execnz .LBB49_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12244,12 +12912,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12266,7 +12934,7 @@ define void 
@global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12291,12 +12959,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12313,7 +12981,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12339,17 +13007,17 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB45_1 +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) 
%ptr, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12359,7 +13027,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12375,18 +13043,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-NEXT: s_cbranch_execnz .LBB50_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB46_1: 
; %atomicrmw.start +; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12400,18 +13068,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 +; GFX940-NEXT: s_cbranch_execnz .LBB50_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12427,18 +13095,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-NEXT: s_cbranch_execnz .LBB50_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12452,18 +13120,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB46_1 +; GFX10-NEXT: s_cbranch_execnz .LBB50_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12475,18 +13143,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off 
offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12498,12 +13166,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB46_1 +; GFX908-NEXT: s_cbranch_execnz .LBB50_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -12512,7 +13180,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12527,12 +13195,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -12549,7 +13217,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12574,12 +13242,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -12596,7 +13264,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12622,18 +13290,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 
s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB46_1 +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12643,7 +13311,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -12659,18 +13327,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-NEXT: s_cbranch_execnz .LBB51_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12684,18 +13352,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 +; GFX940-NEXT: s_cbranch_execnz .LBB51_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12711,18 +13379,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-NEXT: s_cbranch_execnz .LBB51_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12736,18 +13404,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB47_1 +; GFX10-NEXT: s_cbranch_execnz .LBB51_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12759,18 +13427,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 ; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -12782,12 +13450,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB47_1 +; GFX908-NEXT: s_cbranch_execnz .LBB51_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -12796,7 +13464,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -12811,12 
+13479,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -12837,7 +13505,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12862,12 +13530,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -12888,7 +13556,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -12914,18 +13582,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB47_1 +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -12935,7 +13603,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, v3 @@ -12951,19 +13619,19 @@ define 
<2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-NEXT: s_cbranch_execnz .LBB52_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v5, v3 @@ -12977,19 +13645,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 +; GFX940-NEXT: s_cbranch_execnz .LBB52_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, v3 @@ -13005,19 +13673,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-NEXT: s_cbranch_execnz .LBB52_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v3 @@ -13031,19 +13699,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB48_1 +; GFX10-NEXT: s_cbranch_execnz .LBB52_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v2, 
v2 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 @@ -13057,19 +13725,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v3 @@ -13081,13 +13749,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB48_1 +; GFX908-NEXT: s_cbranch_execnz .LBB52_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -13096,7 +13764,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -13111,12 +13779,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13133,7 +13801,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13158,14 +13826,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 ; GFX7-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13182,7 +13850,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -13208,7 +13876,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB48_1 +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v2 @@ -13216,12 +13884,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } -define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +define void 
@global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13231,7 +13899,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 @@ -13247,18 +13915,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-NEXT: s_cbranch_execnz .LBB53_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13272,18 +13940,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: s_cbranch_execnz .LBB53_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 ; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13299,18 +13967,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-NEXT: s_cbranch_execnz .LBB53_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13324,18 +13992,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-NEXT: s_cbranch_execnz .LBB53_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13349,18 +14017,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 -; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 @@ -13372,12 +14040,12 @@ define void 
@global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB49_1 +; GFX908-NEXT: s_cbranch_execnz .LBB53_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -13386,7 +14054,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 -; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -13401,12 +14069,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13423,7 +14091,7 @@ define void 
@global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13448,12 +14116,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13470,7 +14138,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 -; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -13496,13 +14164,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13510,8 +14178,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace( ; <2 x bfloat> ; -------------------------------------------------------------------- -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13522,7 +14190,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13555,13 +14223,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-NEXT: s_cbranch_execnz .LBB54_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -13570,7 +14238,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13597,13 +14265,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NEXT: s_cbranch_execnz .LBB54_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -13612,7 +14280,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -13645,21 +14313,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-NEXT: s_cbranch_execnz 
.LBB54_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -13686,13 +14354,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-NEXT: s_cbranch_execnz .LBB54_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -13701,7 +14369,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -13726,13 +14394,13 @@ 
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -13741,7 +14409,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -13766,20 +14434,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB50_1 +; GFX908-NEXT: s_cbranch_execnz .LBB54_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 -; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 @@ -13807,13 +14475,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -13829,7 +14497,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13851,14 +14519,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -13874,7 +14542,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -13897,19 +14565,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -13920,7 +14588,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 
16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -13953,13 +14621,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-NEXT: s_cbranch_execnz .LBB55_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -13968,7 +14636,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -13995,13 +14663,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NEXT: s_cbranch_execnz .LBB55_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; 
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -14010,7 +14678,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14043,21 +14711,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-NEXT: s_cbranch_execnz .LBB55_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14084,13 +14752,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-NEXT: s_cbranch_execnz .LBB55_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14099,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14124,13 +14792,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -14139,7 +14807,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; 
GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14164,13 +14832,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB51_1 +; GFX908-NEXT: s_cbranch_execnz .LBB55_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -14179,7 +14847,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14207,12 +14875,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: 
global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -14228,7 +14896,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14250,14 +14918,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -14273,7 +14941,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -14296,7 +14964,7 @@ define <2 x bfloat> 
@global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -14304,12 +14972,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14320,7 +14988,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -14353,13 +15021,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-NEXT: s_cbranch_execnz .LBB56_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14368,7 +15036,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -14395,13 +15063,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NEXT: s_cbranch_execnz .LBB56_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -14410,7 +15078,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -14443,21 +15111,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-NEXT: s_cbranch_execnz .LBB56_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -14484,13 +15152,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-NEXT: s_cbranch_execnz .LBB56_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX90A-LABEL: 
global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14499,7 +15167,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -14524,13 +15192,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -14539,7 +15207,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -14564,13 +15232,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB52_1 +; GFX908-NEXT: s_cbranch_execnz .LBB56_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 @@ -14579,7 +15247,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -14607,12 +15275,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -14632,7 +15300,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14654,12 +15322,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -14679,7 +15347,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -14702,18 +15370,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") 
seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16: +define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -14724,7 +15392,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14756,12 +15424,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-NEXT: s_cbranch_execnz .LBB57_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off @@ -14770,7 +15438,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 
0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14797,12 +15465,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NEXT: s_cbranch_execnz .LBB57_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -14811,7 +15479,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14843,20 +15511,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-NEXT: s_cbranch_execnz .LBB57_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: 
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14883,12 +15551,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-NEXT: s_cbranch_execnz .LBB57_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -14897,7 +15565,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14922,12 +15590,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -14936,7 +15604,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -14961,19 +15629,19 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB53_1 +; GFX908-NEXT: s_cbranch_execnz .LBB57_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15001,12 +15669,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: 
s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15022,7 +15690,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15044,12 +15712,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15065,7 +15733,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15088,17 +15756,17 @@ define 
void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15109,7 +15777,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15141,12 +15809,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-NEXT: s_cbranch_execnz .LBB58_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15155,7 +15823,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15182,12 +15850,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NEXT: s_cbranch_execnz .LBB58_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15196,7 +15864,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ 
-15228,20 +15896,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-NEXT: s_cbranch_execnz .LBB58_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15268,12 +15936,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-NEXT: s_cbranch_execnz .LBB58_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15282,7 +15950,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 
0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15307,12 +15975,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15321,7 +15989,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15346,12 +16014,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB54_1 +; GFX908-NEXT: s_cbranch_execnz .LBB58_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -15360,7 +16028,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15388,12 +16056,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -15409,7 +16077,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15431,12 +16099,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -15452,7 +16120,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15475,18 +16143,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +define void 
@global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15497,7 +16165,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15529,12 +16197,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-NEXT: s_cbranch_execnz .LBB59_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15543,7 +16211,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15570,12 +16238,12 @@ define 
void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NEXT: s_cbranch_execnz .LBB59_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 @@ -15584,7 +16252,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15616,20 +16284,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-NEXT: s_cbranch_execnz .LBB59_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 
v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15656,12 +16324,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-NEXT: s_cbranch_execnz .LBB59_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15670,7 +16338,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15695,12 +16363,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX908-LABEL: 
global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048 @@ -15709,7 +16377,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15734,12 +16402,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB55_1 +; GFX908-NEXT: s_cbranch_execnz .LBB59_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 @@ -15748,7 +16416,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -15776,12 +16444,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_movk_i32 s4, 0xf800 @@ -15801,7 +16469,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15823,12 +16491,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg: +; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_movk_i32 s4, 0xf800 @@ -15848,7 +16516,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; 
GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -15871,18 +16539,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -15893,7 +16561,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v3 @@ -15926,13 +16594,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-NEXT: s_cbranch_execnz .LBB60_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -15941,7 +16609,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v7, v3 @@ -15968,13 +16636,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NEXT: s_cbranch_execnz .LBB60_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -15983,7 +16651,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_mov_b32 s1, 0 ; 
GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v6, v3 @@ -16016,21 +16684,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-NEXT: s_cbranch_execnz .LBB60_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v6, v3 @@ -16057,13 +16725,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-NEXT: s_cbranch_execnz .LBB60_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX90A-LABEL: 
global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16072,7 +16740,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v7, v3 @@ -16099,13 +16767,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16114,7 +16782,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v6, v3 @@ -16139,13 +16807,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB56_1 +; GFX908-NEXT: s_cbranch_execnz .LBB60_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 @@ -16154,7 +16822,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 @@ -16182,12 +16850,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16203,7 +16871,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16225,14 +16893,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16248,7 +16916,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -16271,7 +16939,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v0, v3 @@ -16279,12 +16947,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } -define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { +; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_expcnt 0x0 @@ -16295,7 +16963,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16327,12 +16995,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-NEXT: s_cbranch_execnz .LBB61_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16341,7 +17009,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_movk_i32 s4, 0x7fff ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16368,12 +17036,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NEXT: s_cbranch_execnz .LBB61_1 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 @@ -16382,7 +17050,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16414,20 +17082,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-NEXT: s_cbranch_execnz 
.LBB61_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16454,12 +17122,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-NEXT: s_cbranch_execnz .LBB61_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16468,7 +17136,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 
16, v3 @@ -16495,12 +17163,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044 @@ -16509,7 +17177,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16534,12 +17202,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB57_1 +; GFX908-NEXT: s_cbranch_execnz .LBB61_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 @@ -16548,7 +17216,7 @@ define void 
@global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16576,12 +17244,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -16597,7 +17265,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16619,12 +17287,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-LABEL: 
global_system_atomic_fmin_noret_v2bf16__offset12b_pos: +; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s6, 0 @@ -16640,7 +17308,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -16663,15 +17331,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB57_1 +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511 - %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll index 1f0ae39082865c..0612383c3f90b1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll @@ -6,25 +6,25 @@ ; FIXME: This will still fail for gfx6/7 and gfx10 
subtargets. ; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000 -; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v1, v0, v0, vcc // {{[0-9A-Z]+}}: 00020100 +; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 { ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: .LBB0_2: diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index c790187f9d108a..38b9c5df7faa1b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -49,8 +49,8 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) { ; R600-LABEL: available_externally_test -; GCN-PAL: s_mov_b32 s1, available_externally@abs32@hi -; GCN-PAL: s_mov_b32 s0, available_externally@abs32@lo +; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi +; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo define amdgpu_kernel void 
@available_externally_test(ptr addrspace(1) %out) { %ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1 %val = load i32, ptr addrspace(4) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll index 7f6a3ad5c93460..b8ecbae3d3114c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half8: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -18,7 +18,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half8: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -28,7 +28,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -74,7 +74,7 @@ define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half6: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GFX908-NEXT: v_mov_b32_e32 v3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -84,7 +84,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half6: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -94,7 +94,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half6: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1] @@ -132,7 +132,7 @@ define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half4: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v2, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -144,7 +144,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half4: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half4: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 { ; GFX908-LABEL: half2: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_load_dword v1, v0, s[0:1] @@ -198,7 +198,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX90A-LABEL: half2: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] @@ -208,7 +208,7 @@ define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr add ; ; GFX1030-LABEL: half2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll index e54cd64798a682..f709eae990bda2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: global_load_dword v0, v1, s[0:1] glc @@ -54,7 +54,7 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 9bee539b1e4e5c..dac3a3db7b450b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,8 +19,8 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_add_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,12 +32,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_add_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -50,8 +50,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -65,14 +65,14 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_add_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -80,12 +80,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_add_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: 
global_atomic_add v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -98,8 +98,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -112,8 +112,8 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_add_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -126,12 +126,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_add_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -144,29 +144,29 @@ entry: define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 
0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_add_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -174,12 +174,12 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_add_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -195,8 +195,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 
%in) { ; SI-LABEL: atomic_add_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,29 +213,29 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -251,9 +251,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr 
addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -268,18 +268,18 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -287,12 +287,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 
s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -310,9 +310,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -332,22 +332,22 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -356,12 +356,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -381,8 +381,8 @@ entry: define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,8 +394,8 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_add_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,12 +407,12 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_add_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -424,8 
+424,8 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_add_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -460,11 +460,11 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_add_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -479,9 +479,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -496,16 +496,16 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_add_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_add v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -513,12 +513,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_add_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -535,9 +535,9 @@ entry: define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_add_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -557,20 +557,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -579,12 +579,12 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, 
i32 %in) { ; SI-LABEL: atomic_and_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -616,8 +616,8 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_and_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -629,12 +629,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_and_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -665,29 +665,29 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr 
; ; VI-LABEL: atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -703,9 +703,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -720,18 +720,18 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -739,12 +739,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -762,9 +762,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; 
SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -784,22 +784,22 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -808,12 +808,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 
; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -833,8 +833,8 @@ entry: define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_and_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -846,8 +846,8 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_and_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -859,12 +859,12 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_and_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -876,8 +876,8 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_and_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-894,8 +894,8 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,11 +912,11 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -931,9 +931,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -948,16 +948,16 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_and_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; 
VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_and v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -965,12 +965,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_and_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -987,9 +987,9 @@ entry: define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_and_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1009,20 +1009,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
-; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1031,12 +1031,12 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1055,8 +1055,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,8 +1068,8 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 
%in) ; ; VI-LABEL: atomic_sub_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1081,12 +1081,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_sub_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1099,8 +1099,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,29 +1117,29 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 
s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1155,9 +1155,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1172,18 +1172,18 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1191,12 +1191,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_sub_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1214,9 +1214,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1236,22 +1236,22 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i32_ret_addr64_offset: 
; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1260,12 +1260,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1285,8 +1285,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_sub_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1298,8 +1298,8 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_sub_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_sub_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1328,8 +1328,8 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_sub_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,8 +1346,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1383,9 +1383,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1400,16 +1400,16 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_sub_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_sub v[0:1], v2 ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_sub_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1439,9 +1439,9 @@ entry: define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_sub_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1461,20 +1461,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; 
VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1483,12 +1483,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1507,8 +1507,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,8 +1520,8 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_max_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1533,12 +1533,12 @@ define amdgpu_kernel void 
@atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_max_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -1551,8 +1551,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,28 +1568,28 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1604,9 +1604,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1619,29 +1619,29 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 
v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1657,9 +1657,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1678,20 +1678,20 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 
16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1701,12 +1701,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1725,8 +1725,8 @@ entry: define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_max_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1736,8 +1736,8 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_max_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,12 +1747,12 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 
%in) { ; ; GFX9-LABEL: atomic_max_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -1762,8 +1762,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_max_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1779,8 +1779,8 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1796,11 +1796,11 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, 
s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -1814,9 +1814,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -1829,27 +1829,27 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; 
GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -1864,9 +1864,9 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -1885,18 +1885,18 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -1906,12 +1906,12 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -1929,8 +1929,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1940,8 +1940,8 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umax_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -1967,8 +1967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; 
SI-LABEL: atomic_umax_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1984,28 +1984,28 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2020,9 +2020,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) 
%out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2035,29 +2035,29 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2073,9 +2073,9 @@ entry: 
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2094,20 +2094,20 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2117,12 +2117,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2141,8 +2141,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umax_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,8 +2152,8 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umax_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2163,12 +2163,12 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umax_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2178,8 +2178,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_umax_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2195,8 +2195,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2212,11 +2212,11 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2230,9 +2230,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2245,27 +2245,27 @@ define amdgpu_kernel void 
@atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umax_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umax v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umax_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2280,9 +2280,9 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2301,18 +2301,18 @@ define amdgpu_kernel void 
@atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2345,8 +2345,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2356,8 +2356,8 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_min_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_min_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2383,8 +2383,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2400,28 +2400,28 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2436,9 +2436,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2451,29 +2451,29 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2489,9 +2489,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2510,20 +2510,20 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: 
atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2533,12 +2533,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2557,8 +2557,8 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2568,8 +2568,8 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2579,12 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2594,8 +2594,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_min_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2611,8 +2611,8 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2628,11 +2628,11 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2646,9 +2646,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2661,27 +2661,27 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_min_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: 
flat_atomic_smin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_min_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2696,9 +2696,9 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2717,18 +2717,18 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: 
v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2761,8 +2761,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2772,8 +2772,8 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2783,12 +2783,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_umin_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -2799,8 +2799,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2816,28 +2816,28 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_ret_offset: ; GFX9: ; %bb.0: 
; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -2852,9 +2852,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -2867,29 +2867,29 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64_offset: ; GFX9: ; %bb.0: ; 
%entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -2905,9 +2905,9 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -2926,20 +2926,20 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 
v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -2949,12 +2949,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -2973,8 +2973,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_umin_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,8 +2984,8 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_umin_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,12 +2995,12 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_umin_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3010,8 +3010,8 @@ entry: define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_umin_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3027,8 +3027,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3044,11 +3044,11 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] @@ -3062,9 +3062,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3077,27 +3077,27 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_umin_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_umin v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_umin_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3112,9 +3112,9 @@ entry: define amdgpu_kernel void 
@atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_umin_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3133,18 +3133,18 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3154,12 +3154,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3177,8 +3177,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3190,8 +3190,8 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_or_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3203,12 +3203,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_or_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3221,8 +3221,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_or_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword 
s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3239,29 +3239,29 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3277,9 +3277,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3294,18 +3294,18 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; VI-LABEL: atomic_or_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; ; GFX9-LABEL: atomic_or_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3336,9 +3336,9 @@ 
entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3358,22 +3358,22 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3382,12 +3382,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3407,8 +3407,8 @@ entry: define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_or_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3420,8 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_or_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3433,12 +3433,12 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_or_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3450,8 +3450,8 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: 
atomic_or_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3468,8 +3468,8 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3486,11 +3486,11 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3505,9 +3505,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3522,16 +3522,16 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) 
%out, i32 %in, ; ; VI-LABEL: atomic_or_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3539,12 +3539,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_or_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3561,9 +3561,9 @@ entry: define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_or_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3583,20 +3583,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3605,12 +3605,12 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3629,8 +3629,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3642,8 +3642,8 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3655,12 +3655,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3673,8 +3673,8 @@ entry: define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3686,8 +3686,8 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,12 +3699,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3717,8 +3717,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3735,29 +3735,29 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 
+; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -3773,9 +3773,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -3790,18 +3790,18 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 
s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -3809,12 +3809,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -3832,9 +3832,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -3854,22 +3854,22 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -3878,12 +3878,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -3903,8 +3903,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3916,8 +3916,8 @@ define 
amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,12 +3929,12 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -3946,8 +3946,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3964,8 +3964,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3982,11 +3982,11 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr 
addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4001,9 +4001,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4018,16 +4018,16 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_xchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_swap v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4035,12 +4035,12 @@ define amdgpu_kernel void 
@atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_xchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4057,9 +4057,9 @@ entry: define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4079,20 +4079,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 
+; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4101,12 +4101,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4125,7 +4125,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4140,7 +4140,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4155,7 +4155,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4173,8 +4173,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4192,31 +4192,31 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4233,10 +4233,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4252,19 +4252,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap 
v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4273,17 +4273,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4298,10 +4298,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4309,8 +4309,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: 
v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc @@ -4322,24 +4322,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4348,13 +4348,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 
0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4376,7 +4376,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4391,7 +4391,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4423,8 +4423,8 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4442,8 +4442,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i32_ret: ; VI: ; %bb.0: ; %entry -; 
VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4461,12 +4461,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4482,10 +4482,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s7, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4501,17 +4501,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dword s6, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x3c +; 
VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4520,17 +4520,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_cmpxchg_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x3c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4544,10 +4544,10 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) { ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s10, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0x11 
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s10, s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc @@ -4568,22 +4568,22 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s9, s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4592,13 +4592,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; ; GFX9-LABEL: 
atomic_cmpxchg_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 -; GFX9-NEXT: s_load_dword s9, s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 @@ -4619,8 +4619,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4632,8 +4632,8 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_xor_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4645,12 +4645,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_xor_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4663,8 +4663,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,29 +4681,29 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4719,9 +4719,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4736,18 +4736,18 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4755,12 +4755,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -4778,9 +4778,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -4800,22 +4800,22 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -4824,12 +4824,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -4849,8 +4849,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_xor_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4862,8 +4862,8 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: atomic_xor_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,12 +4875,12 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_xor_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4892,8 +4892,8 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_xor_i32_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4910,8 +4910,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i32_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s8, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4928,11 +4928,11 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4947,9 +4947,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) { ; 
SI-LABEL: atomic_xor_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -4964,16 +4964,16 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_xor_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_xor v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4981,12 +4981,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_xor_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ 
-5003,9 +5003,9 @@ entry: define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_xor_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5025,20 +5025,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -5047,12 +5047,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -5071,7 +5071,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5087,7 +5087,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5105,7 +5105,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5123,7 +5123,7 @@ entry: define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5141,7 +5141,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i32_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5159,7 +5159,7 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i32_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc @@ -5177,7 +5177,7 @@ entry: define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,7 +5193,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5229,7 +5229,7 @@ entry: define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5245,7 +5245,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5261,7 +5261,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5278,8 +5278,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5298,8 +5298,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5320,11 +5320,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5343,8 +5343,8 @@ entry: define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5363,8 +5363,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,11 +5383,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -5405,8 +5405,8 @@ entry: define 
amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_load_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -5425,8 +5425,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5447,11 +5447,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc @@ -5470,8 +5470,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -5481,25 +5481,25 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 @@ -5510,8 +5510,8 @@ entry: define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5521,23 +5521,23 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5547,8 +5547,8 @@ entry: define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5558,23 +5558,23 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f32: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5614,14 +5614,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5634,8 +5634,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s2, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5648,8 +5648,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; VI-LABEL: atomic_store_f32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5664,14 +5664,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; ; GFX9-LABEL: atomic_store_f32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm entry: @@ -5684,8 +5684,8 @@ entry: define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; 
SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5699,8 +5699,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5713,14 +5713,14 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5732,8 +5732,8 @@ entry: define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) { ; SI-LABEL: atomic_store_f32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 @@ -5747,8 +5747,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; VI-LABEL: atomic_store_f32_addr64: ; VI: ; %bb.0: ; %entry 
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 @@ -5761,14 +5761,14 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; ; GFX9-LABEL: atomic_store_f32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: @@ -5780,7 +5780,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5796,7 +5796,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; VI-LABEL: atomic_load_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5812,7 +5812,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; ; GFX9-LABEL: atomic_load_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc @@ -5830,7 +5830,7 @@ entry: define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i8_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -5848,7 +5848,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; VI-LABEL: atomic_load_i8_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5866,7 +5866,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; ; GFX9-LABEL: atomic_load_i8_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc @@ -5884,8 +5884,8 @@ entry: define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5895,25 +5895,25 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_i8_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, 
s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 @@ -5924,8 +5924,8 @@ entry: define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i8: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5935,23 +5935,23 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i8: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5977,7 +5977,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5993,7 +5993,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -6011,7 +6011,7 @@ entry: define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_i16_negoffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 
@@ -6029,7 +6029,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_i16_negoffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_i16_negoffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -6065,8 +6065,8 @@ entry: define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6076,25 +6076,25 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, 
s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 @@ -6105,8 +6105,8 @@ entry: define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6116,23 +6116,23 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store 
atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6142,8 +6142,8 @@ entry: define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6153,25 +6153,25 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; ; VI-LABEL: atomic_store_f16_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 @@ -6182,8 +6182,8 @@ entry: define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6193,23 +6193,23 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 @@ -6219,8 +6219,8 @@ entry: define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6230,25 +6230,25 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; ; VI-LABEL: atomic_store_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6258,8 +6258,8 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_store_bf16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6269,23 +6269,23 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; ; VI-LABEL: atomic_store_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s2, s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; 
VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_store_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6294,8 +6294,8 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6307,8 +6307,8 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_inc_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6320,12 +6320,12 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_inc_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6338,8 +6338,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6353,14 +6353,14 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_inc_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6368,12 +6368,12 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_inc_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6386,8 +6386,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6400,8 +6400,8 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_inc_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6414,12 +6414,12 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_inc_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6432,29 +6432,29 @@ entry: define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_inc_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6462,12 +6462,12 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_inc_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -6482,8 +6482,8 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_inc_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6500,29 +6500,29 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6538,9 +6538,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6555,18 +6555,18 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_inc v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6574,12 +6574,12 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6597,9 +6597,9 @@ entry: define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_inc_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6619,22 +6619,22 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_inc_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -6643,12 +6643,12 @@ define 
amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -6668,8 +6668,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6681,8 +6681,8 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: atomic_dec_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6694,12 +6694,12 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; ; GFX9-LABEL: atomic_dec_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6712,8 +6712,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_max_neg_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000 @@ -6727,14 +6727,14 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_dec_i32_max_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xfffff000 -; VI-NEXT: s_addc_u32 s1, s1, -1 +; VI-NEXT: s_add_u32 s0, s2, 0xfffff000 +; VI-NEXT: s_addc_u32 s1, s3, -1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6742,12 +6742,12 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_dec_i32_max_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:-4096 +; GFX9-NEXT: 
global_atomic_dec v0, v1, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6760,8 +6760,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_soffset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6774,8 +6774,8 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; VI-LABEL: atomic_dec_i32_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s5, 0x8ca0 @@ -6788,12 +6788,12 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; ; GFX9-LABEL: atomic_dec_i32_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:3232 +; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -6806,29 +6806,29 @@ entry: define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_dec_i32_huge_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: 
s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_i32_huge_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 0xdeac -; VI-NEXT: s_addc_u32 s1, s1, 0xabcd +; VI-NEXT: s_add_u32 s0, s2, 0xdeac +; VI-NEXT: s_addc_u32 s1, s3, 0xabcd ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6836,12 +6836,12 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; ; GFX9-LABEL: atomic_dec_i32_huge_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 0xdeac -; GFX9-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac +; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6856,8 +6856,8 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr 
addrspace(1) %out2, i32 %in) { ; SI-LABEL: atomic_dec_i32_ret_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -6874,29 +6874,29 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i32_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_dec_i32_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6912,9 +6912,9 @@ entry: define amdgpu_kernel void 
@atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dword s6, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dword s6, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 @@ -6929,18 +6929,18 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_atomic_dec v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -6948,12 +6948,12 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 
s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 @@ -6971,9 +6971,9 @@ entry: define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) { ; SI-LABEL: atomic_dec_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xf -; SI-NEXT: s_load_dword s2, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s6 @@ -6993,22 +6993,22 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_dec_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; VI-NEXT: s_add_u32 s0, s4, s0 ; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 @@ -7017,12 +7017,12 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset: ; GFX9: 
; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 @@ -7042,7 +7042,7 @@ entry: define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_f16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_f16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc @@ -7091,7 +7091,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_f16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7109,7 +7109,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; VI-LABEL: atomic_load_f16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7127,7 +7127,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; ; GFX9-LABEL: atomic_load_f16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc @@ -7144,7 +7144,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7160,7 +7160,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; VI-LABEL: atomic_load_bf16_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7176,7 +7176,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; ; GFX9-LABEL: atomic_load_bf16_offset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, 
s[0:1] offset:16 glc @@ -7193,7 +7193,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; SI-LABEL: atomic_load_bf16_negoffset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s2 @@ -7211,7 +7211,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_bf16_negoffset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7229,7 +7229,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_bf16_negoffset: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a7ba8a084272b4..516c92f1640eae 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4616,7 +4616,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4648,7 +4648,7 @@ define amdgpu_kernel void 
@atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4679,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4714,8 +4714,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4753,8 +4753,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4789,24 +4789,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -4829,7 +4829,7 @@ entry: define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -4861,7 +4861,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -4890,7 +4890,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: 
atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -4924,8 +4924,8 @@ entry: define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_max_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -4963,8 +4963,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -4997,24 +4997,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 
s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -5869,7 +5869,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -5932,7 +5932,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -5967,8 +5967,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 
%in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6006,8 +6006,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6042,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; 
GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -6082,8 +6082,8 @@ entry: define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_umax_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -6121,8 +6121,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -6155,24 +6155,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: 
s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_max_u32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_max_u32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -7860,7 +7860,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s3, 31 ; SI-NEXT: s_mov_b32 s4, s3 @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s5, s3, 31 ; VI-NEXT: s_mov_b32 s4, s3 @@ -7923,7 +7923,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s3, 31 @@ -7958,8 +7958,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -7997,8 +7997,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i32_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8033,24 +8033,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 @@ -8073,36 +8073,36 @@ entry: define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_min_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[0:1], 0x0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s3, s[4:5], 0x0 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s6, v1 +; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_min_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -8126,24 +8126,24 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: atomic_min_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 -; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -8155,8 +8155,8 @@ entry: define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) { ; SI-LABEL: atomic_min_i32_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
; SI-NEXT: s_ashr_i32 s5, s9, 31 ; SI-NEXT: s_mov_b32 s4, s9 @@ -8194,8 +8194,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i32_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s7, s5, 31 ; VI-NEXT: s_mov_b32 s6, s5 @@ -8228,24 +8228,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i32_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s3, s1, 31 -; GFX9-NEXT: s_mov_b32 s2, s1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: s_addc_u32 s3, s5, s3 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s0, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 40f0acf3d5d09f..d558393442643d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -20,7 +20,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_add_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -33,7 +33,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_add_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -45,7 +45,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -63,8 +63,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; 
CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -82,8 +82,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -101,12 +101,12 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -116,8 +116,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -139,8 +139,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: 
atomic_add_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -156,8 +156,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_add_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -175,12 +175,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_add_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -192,8 +192,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ 
-215,7 +215,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -236,7 +236,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_add_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -259,7 +259,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -275,7 +275,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -301,7 +301,7 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_add_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -316,7 +316,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; 
VI-LABEL: atomic_add_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +331,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_add_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -343,7 +343,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -360,8 +360,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_add_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -379,8 +379,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_add_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -398,12 +398,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: 
atomic_add_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -413,8 +413,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_add_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -435,8 +435,8 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -452,8 +452,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_add_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 3 @@ -469,12 +469,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_add_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -486,8 +486,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -508,7 +508,7 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_add_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -529,7 +529,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_add_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -550,7 +550,7 @@ define amdgpu_kernel void 
@atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_add_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -566,7 +566,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -591,7 +591,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -604,7 +604,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_and_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -617,7 +617,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_and_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -629,7 +629,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -647,8 +647,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -666,8 +666,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -685,12 +685,12 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -700,8 +700,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -723,8 +723,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -740,8 +740,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_and_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -759,12 +759,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_and_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -776,8 +776,8 @@ define 
amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -799,7 +799,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -820,7 +820,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_and_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -843,7 +843,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -859,7 +859,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -885,7 +885,7 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_and_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -900,7 +900,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_and_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -915,7 +915,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_and_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -927,7 +927,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -944,8 +944,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_and_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -963,8 +963,8 @@ define amdgpu_kernel void 
@atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_and_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -982,12 +982,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_and_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -997,8 +997,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_and_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1019,8 +1019,8 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1036,8 +1036,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_and_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1053,12 +1053,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_and_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1070,8 +1070,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1092,7 +1092,7 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_and_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1113,7 +1113,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_and_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_and_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1150,7 +1150,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1175,7 +1175,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1188,7 +1188,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_sub_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1201,7 +1201,7 @@ 
define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_sub_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1213,7 +1213,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1231,8 +1231,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1250,8 +1250,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,12 +1269,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1284,8 +1284,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1307,8 +1307,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1324,8 +1324,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_sub_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1343,12 +1343,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: 
atomic_sub_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1360,8 +1360,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1383,7 +1383,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_sub_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1427,7 +1427,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: 
atomic_sub_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1469,7 +1469,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_sub_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_sub_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1499,7 +1499,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_sub_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1528,8 +1528,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_sub_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1547,8 +1547,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_sub_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,12 +1566,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -1581,8 +1581,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_sub_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1603,8 +1603,8 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1620,8 +1620,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_sub_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1637,12 +1637,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_sub_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1654,8 +1654,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1676,7 +1676,7 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_sub_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1697,7 +1697,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_sub_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_sub_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1734,7 +1734,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1759,7 +1759,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_max_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_max_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_max_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1791,7 +1791,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -1809,8 +1809,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1827,8 +1827,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_offset: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1845,12 +1845,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -1859,8 +1859,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -1882,8 +1882,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; 
CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1897,8 +1897,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -1914,12 +1914,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -1929,8 +1929,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1952,7 +1952,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 
s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -1972,7 +1972,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2009,7 +2009,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2035,7 +2035,7 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_max_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_max_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2061,7 +2061,7 @@ define amdgpu_kernel void 
@atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_max_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2071,7 +2071,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2088,8 +2088,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_max_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2106,8 +2106,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_max_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2124,12 +2124,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_max_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 
v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2138,8 +2138,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_max_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2160,8 +2160,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2175,8 +2175,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2190,12 +2190,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2205,8 +2205,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2227,7 +2227,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_max_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2247,7 +2247,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2282,7 +2282,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2307,7 +2307,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umax_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2318,7 +2318,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2329,7 +2329,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2339,7 +2339,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2357,8 +2357,8 @@ entry: 
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2375,8 +2375,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2393,12 +2393,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2407,8 +2407,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2430,8 +2430,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2445,8 +2445,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2462,12 +2462,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2477,8 +2477,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2500,7 +2500,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2520,7 +2520,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2557,7 +2557,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2583,7 +2583,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; 
CI-LABEL: atomic_umax_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2596,7 +2596,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umax_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2609,7 +2609,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umax_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2619,7 +2619,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2636,8 +2636,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umax_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2654,8 +2654,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umax_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2672,12 +2672,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umax_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2686,8 +2686,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umax_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2708,8 +2708,8 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2723,8 +2723,8 @@ define amdgpu_kernel 
void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umax_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2738,12 +2738,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umax_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -2753,8 +2753,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2775,7 +2775,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umax_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-2795,7 +2795,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2815,7 +2815,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2830,7 +2830,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2855,7 +2855,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2866,7 +2866,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_min_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2877,7 +2877,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_min_i64_offset: ; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2887,7 +2887,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -2905,8 +2905,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2923,8 +2923,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2941,12 +2941,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -2955,8 +2955,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -2978,8 +2978,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -2993,8 +2993,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3010,12 +3010,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3025,8 +3025,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3048,7 +3048,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3068,7 +3068,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3090,7 +3090,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3105,7 +3105,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3131,7 +3131,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_min_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3144,7 +3144,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3157,7 +3157,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3167,7 +3167,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3184,8 +3184,8 @@ entry: define amdgpu_kernel void 
@atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_min_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3202,8 +3202,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_min_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3220,12 +3220,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_min_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3234,8 +3234,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_min_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3256,8 +3256,8 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3271,8 +3271,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_min_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3286,12 +3286,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_min_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3301,8 +3301,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3323,7 +3323,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_min_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3363,7 +3363,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3378,7 +3378,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3403,7 +3403,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3414,7 +3414,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3425,7 +3425,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3435,7 +3435,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3453,8 +3453,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3471,8 +3471,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3489,12 +3489,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3503,8 +3503,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3526,8 +3526,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3541,8 +3541,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr 
addrspace(1) %out, ; ; VI-LABEL: atomic_umin_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3558,12 +3558,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umin_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3573,8 +3573,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3596,7 +3596,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3616,7 
+3616,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umin_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3638,7 +3638,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3653,7 +3653,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3679,7 +3679,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_umin_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3692,7 +3692,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_umin_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3705,7 +3705,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_umin_i64: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3715,7 +3715,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -3732,8 +3732,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_umin_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3750,8 +3750,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_umin_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3768,12 +3768,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_umin_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] @@ -3782,8 +3782,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_umin_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3804,8 +3804,8 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3819,8 +3819,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_umin_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -3834,12 +3834,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_umin_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -3849,8 +3849,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3871,7 +3871,7 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_umin_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -3891,7 +3891,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umin_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -3911,7 +3911,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umin_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -3926,7 
+3926,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -3951,7 +3951,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3964,7 +3964,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_or_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3977,7 +3977,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_or_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -3989,7 +3989,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4007,8 +4007,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; 
CI-LABEL: atomic_or_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4026,8 +4026,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4045,12 +4045,12 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4060,8 +4060,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; 
GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4083,8 +4083,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4100,8 +4100,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; VI-LABEL: atomic_or_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4119,12 +4119,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; ; GFX9-LABEL: atomic_or_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4136,8 +4136,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4159,7 +4159,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4180,7 +4180,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; VI-LABEL: atomic_or_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4203,7 +4203,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4219,7 +4219,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4245,7 +4245,7 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_or_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: 
s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4260,7 +4260,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_or_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4275,7 +4275,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_or_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4287,7 +4287,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4304,8 +4304,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_or_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4323,8 +4323,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: atomic_or_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4342,12 +4342,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: atomic_or_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4357,8 +4357,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-LABEL: atomic_or_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4379,8 +4379,8 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4396,8 +4396,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_or_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4413,12 +4413,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_or_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4430,8 +4430,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4452,7 +4452,7 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_or_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4473,7 +4473,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: atomic_or_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4494,7 +4494,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: atomic_or_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4510,7 +4510,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4535,7 +4535,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4548,7 +4548,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4561,7 +4561,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 
@@ -4573,7 +4573,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4591,7 +4591,7 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) { ; CI-LABEL: atomic_xchg_f64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4604,7 +4604,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; VI-LABEL: atomic_xchg_f64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4617,7 +4617,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4629,7 +4629,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4647,7 +4647,7 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) { ; 
CI-LABEL: atomic_xchg_pointer_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4660,7 +4660,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_pointer_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4673,7 +4673,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4685,7 +4685,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -4703,8 +4703,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4722,8 +4722,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: 
atomic_xchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4741,12 +4741,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -4756,8 +4756,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4779,8 +4779,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4796,8 +4796,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_xchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -4815,12 +4815,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_xchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -4832,8 +4832,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4855,7 +4855,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 
+; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4876,7 +4876,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_xchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4899,7 +4899,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -4915,7 +4915,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -4941,7 +4941,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -4956,7 +4956,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -4971,7 +4971,7 
@@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4983,7 +4983,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5000,8 +5000,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5019,8 +5019,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: atomic_xchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5038,12 +5038,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: atomic_xchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5053,8 +5053,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-LABEL: atomic_xchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5075,8 +5075,8 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5092,8 +5092,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; VI-LABEL: atomic_xchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5109,12 +5109,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; ; GFX9-LABEL: atomic_xchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5126,8 +5126,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5148,7 +5148,7 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5169,7 +5169,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5190,7 +5190,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5206,7 +5206,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5231,7 +5231,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_xor_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -5244,7 +5244,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_xor_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5257,7 +5257,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_xor_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5269,7 +5269,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5287,8 
+5287,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5306,8 +5306,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5325,12 +5325,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5340,8 +5340,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5363,8 +5363,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5380,8 +5380,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_xor_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5399,12 +5399,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_xor_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5416,8 +5416,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: 
s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5439,7 +5439,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5460,7 +5460,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_xor_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5483,7 +5483,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5499,7 +5499,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5525,7 +5525,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: 
atomic_xor_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_xor_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5555,7 +5555,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_xor_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -5567,7 +5567,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -5584,8 +5584,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_xor_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5603,8 +5603,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: atomic_xor_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: 
s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5622,12 +5622,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5637,8 +5637,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-LABEL: atomic_xor_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5659,8 +5659,8 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5676,8 +5676,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) 
%out, i64 %in, ; ; VI-LABEL: atomic_xor_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -5693,12 +5693,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_xor_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -5710,8 +5710,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5732,7 +5732,7 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; CI-LABEL: atomic_xor_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5753,7 +5753,7 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_xor_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -5774,7 +5774,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_xor_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -5790,7 +5790,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -5815,50 +5815,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 
+; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 offset:32 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5867,8 +5867,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5887,52 +5887,52 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_soffset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, 0x11940 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 +; CI-NEXT: s_mov_b32 s4, 0x11940 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_soffset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, 0x11940 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, 
s5 +; VI-NEXT: s_mov_b32 s4, 0x11940 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], s2 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64_soffset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x11000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:2368 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -5941,8 +5941,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -5961,7 +5961,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 
s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -5981,7 +5981,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_cmpxchg_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6001,7 +6001,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -6016,7 +6016,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -6040,7 +6040,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6059,7 +6059,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_cmpxchg_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: 
s_add_u32 s0, s0, s4 @@ -6079,7 +6079,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6096,7 +6096,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6119,32 +6119,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 +; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v4, s10 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v2, s12 +; 
CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v5, s11 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -6169,9 +6169,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6189,8 +6189,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6218,50 +6218,50 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: v_mov_b32_e32 v1, s7 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -6270,8 +6270,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6 ; GFX12-LABEL: atomic_cmpxchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1 @@ -6289,7 +6289,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6309,7 +6309,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: atomic_cmpxchg_i64_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6329,7 +6329,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -6344,7 +6344,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -6367,7 +6367,7 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6386,7 +6386,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; VI-LABEL: atomic_cmpxchg_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 @@ -6404,7 +6404,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX9-LABEL: atomic_cmpxchg_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6421,7 +6421,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, 
s3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7 @@ -6443,32 +6443,32 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) { ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; CI-NEXT: s_mov_b32 s15, 0xf000 -; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: s_mov_b32 s12, s6 -; CI-NEXT: s_mov_b32 s13, s7 +; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 +; CI-NEXT: v_mov_b32_e32 v4, s10 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v2, s12 +; CI-NEXT: v_mov_b32_e32 v3, s13 +; CI-NEXT: v_mov_b32_e32 v5, s11 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; VI-NEXT: s_add_u32 s2, s4, s2 @@ -6491,9 +6491,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr 
addrspace(1) %out, ; ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GFX9-NEXT: s_add_u32 s2, s4, s2 @@ -6511,8 +6511,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1 @@ -6539,7 +6539,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6555,7 +6555,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; VI-LABEL: atomic_load_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6573,7 +6573,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6584,7 +6584,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr ; ; GFX12-LABEL: atomic_load_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 scope:SCOPE_SYS @@ -6604,7 +6604,7 @@ entry: define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64_neg_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_not_b32_e32 v0, 31 ; CI-NEXT: v_mov_b32_e32 v1, -1 @@ -6622,7 +6622,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; VI-LABEL: atomic_load_i64_neg_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6640,7 +6640,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX9-LABEL: atomic_load_i64_neg_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc @@ -6651,7 +6651,7 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr ; ; GFX12-LABEL: atomic_load_i64_neg_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 scope:SCOPE_SYS @@ -6671,7 +6671,7 @@ entry: define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_load_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6687,7 +6687,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; VI-LABEL: atomic_load_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6703,7 +6703,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX9-LABEL: atomic_load_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6714,7 +6714,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1 ; ; GFX12-LABEL: atomic_load_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] scope:SCOPE_DEV @@ -6733,8 +6733,8 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 
0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6753,8 +6753,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6775,11 +6775,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc @@ -6791,8 +6791,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6816,28 +6816,28 @@ entry: define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: 
s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s6 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_mov_b32 s9, s7 +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_load_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6856,11 +6856,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: atomic_load_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc @@ -6872,8 +6872,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr 
addrspace(1) %in, ptr addr ; GFX12-LABEL: atomic_load_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6896,8 +6896,8 @@ entry: define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_load_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6916,8 +6916,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; VI-LABEL: atomic_load_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6938,11 +6938,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; ; GFX9-LABEL: atomic_load_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: global_load_dwordx2 
v[0:1], v2, s[0:1] offset:32 glc @@ -6954,8 +6954,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; GFX12-LABEL: atomic_load_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 @@ -6979,7 +6979,7 @@ entry: define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -6992,7 +6992,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; VI-LABEL: atomic_store_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s2, 32 @@ -7005,7 +7005,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -7015,7 +7015,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou ; ; GFX12-LABEL: atomic_store_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 @@ -7033,7 +7033,7 @@ entry: define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; CI-LABEL: atomic_store_i64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7046,7 +7046,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; VI-LABEL: atomic_store_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -7057,7 +7057,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX9-LABEL: atomic_store_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -7067,7 +7067,7 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) { ; ; GFX12-LABEL: atomic_store_i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -7084,8 +7084,8 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7100,8 +7100,8 @@ define amdgpu_kernel void 
@atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; VI-LABEL: atomic_store_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7117,12 +7117,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; ; GFX9-LABEL: atomic_store_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7132,8 +7132,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; GFX12-LABEL: atomic_store_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7155,24 +7155,24 @@ entry: define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_i64_addr64: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[8:11], 0 addr64 +; CI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_store_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7186,12 +7186,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_store_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7201,8 +7201,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; GFX12-LABEL: atomic_store_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7223,8 +7223,8 @@ entry: define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) { ; CI-LABEL: atomic_store_f64_addr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7239,8 +7239,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; VI-LABEL: atomic_store_f64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7256,12 +7256,12 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; ; GFX9-LABEL: atomic_store_f64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s6, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s1, s7, s1 @@ -7271,8 +7271,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; GFX12-LABEL: atomic_store_f64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -7294,7 +7294,7 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_inc_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7307,7 +7307,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_inc_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7320,7 +7320,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_inc_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7332,7 +7332,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7350,8 +7350,8 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_inc_i64_ret_offset: ; CI: ; 
%bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7369,8 +7369,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_inc_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7388,12 +7388,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_inc_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7403,8 +7403,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7426,8 +7426,8 
@@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_inc_i64_incr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7443,8 +7443,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_inc_i64_incr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7462,12 +7462,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_inc_i64_incr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7479,8 +7479,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; 
GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -7502,7 +7502,7 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) { ; CI-LABEL: atomic_dec_i64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -7515,7 +7515,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; VI-LABEL: atomic_dec_i64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -7528,7 +7528,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX9-LABEL: atomic_dec_i64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -7540,7 +7540,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 @@ -7558,8 +7558,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { ; CI-LABEL: atomic_dec_i64_ret_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; 
CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -7577,8 +7577,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_dec_i64_ret_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7596,12 +7596,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_dec_i64_ret_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol @@ -7611,8 +7611,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 @@ -7634,8 +7634,8 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; CI-LABEL: atomic_dec_i64_decr64_offset: ; CI: ; %bb.0: ; %entry -; CI-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s6 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7651,8 +7651,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_dec_i64_decr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 @@ -7670,12 +7670,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_dec_i64_decr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 @@ -7687,8 +7687,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 8897ad3e950a58..cafd35afea6ebd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -4866,8 +4866,8 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -4905,8 +4905,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -4941,15 +4941,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4981,7 +4981,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5025,7 +5025,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_max_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5064,7 +5064,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -5107,8 +5107,8 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -5146,8 +5146,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; VI-LABEL: atomic_max_i64_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s4, s0, s4 @@ -5180,15 +5180,15 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; ; GFX9-LABEL: atomic_max_i64_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5219,7 +5219,7 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_max_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_max_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -5300,7 +5300,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_max_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6328,8 +6328,8 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -6367,8 +6367,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6403,15 +6403,15 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) @@ -6443,7 +6443,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6487,7 +6487,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; VI-LABEL: atomic_umax_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6526,7 +6526,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -6569,7 +6569,7 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_umax_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -6613,7 +6613,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_umax_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 
@@ -6650,7 +6650,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_umax_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8664,8 +8664,8 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; SI-NEXT: s_add_u32 s4, s0, s4 @@ -8703,8 +8703,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i64_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8739,15 +8739,15 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i64_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; GFX9-NEXT: s_add_u32 
s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, s1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8779,7 +8779,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -8823,7 +8823,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; VI-LABEL: atomic_min_i64_ret_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8862,7 +8862,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 @@ -8905,7 +8905,7 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: atomic_min_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[8:9], 0 @@ -8942,7 +8942,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; VI-LABEL: atomic_min_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; ; GFX9-LABEL: atomic_min_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9006,7 +9006,7 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) { ; SI-LABEL: atomic_min_i64_ret_addr64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; SI-NEXT: s_add_u32 s8, s0, s6 @@ -9050,7 +9050,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; VI-LABEL: atomic_min_i64_ret_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; VI-NEXT: s_add_u32 s6, s0, s6 @@ -9087,7 +9087,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: atomic_min_i64_ret_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index c05f9c679979da..86092a2666639b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -18,15 +18,15 @@ declare double @div.double.value() 
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, 
v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -141,30 +141,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1164-NEXT: .LBB0_2: ; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -172,20 +172,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -193,15 +193,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -229,23 +229,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz 
.LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -261,23 +261,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; 
GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -293,22 +293,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -316,30 +316,30 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1164-DPP-NEXT: .LBB0_2: ; GFX1164-DPP-NEXT: s_nop 0 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -347,20 +347,20 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1] ; GFX1132-DPP-NEXT: .LBB0_2: ; GFX1132-DPP-NEXT: s_nop 0 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1165,21 +1165,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, 
v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1211,25 +1211,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1253,31 +1253,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 
v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1293,29 +1293,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: 
v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1324,19 +1324,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1375,12 +1375,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1390,14 +1390,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1406,30 +1406,30 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1461,25 +1461,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1503,31 +1503,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; 
GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -1543,29 +1543,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword 
s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1574,19 +1574,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1598,7 +1598,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: 
s_load_b32 s2, s[0:1], 0x0 @@ -1625,12 +1625,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1640,14 +1640,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1656,9 +1656,9 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2515,21 +2515,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; 
GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2561,25 +2561,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2603,31 +2603,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: 
s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2643,29 +2643,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: 
global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2674,19 +2674,19 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2698,7 +2698,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2725,12 +2725,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; 
GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -2740,14 +2740,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2756,30 +2756,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; 
GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2811,25 +2811,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2853,31 +2853,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; 
GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -2893,29 +2893,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2924,19 +2924,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2948,7 +2948,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2975,12 +2975,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: 
scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -2990,14 +2990,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3006,9 +3006,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -4607,21 +4607,21 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; 
GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4653,25 +4653,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: 
s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4695,31 +4695,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4735,29 +4735,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: 
s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4766,19 +4766,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; 
GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4790,7 +4790,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -4817,12 +4817,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -4832,14 +4832,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4848,30 +4848,30 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: 
s_cbranch_execz .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4903,25 +4903,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; 
GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4945,31 +4945,31 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, 
s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -4985,29 +4985,29 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5016,19 +5016,19 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt 
vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5067,12 +5067,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -5082,14 +5082,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: 
v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5098,9 +5098,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -5957,53 +5957,46 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; 
GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6011,138 +6004,118 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 
v[41:42], v[0:1], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: 
s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6150,76 +6123,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: 
s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: 
s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6227,69 +6190,61 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 
offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6302,8 +6257,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6313,17 +6268,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; 
GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6331,8 +6282,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6340,26 +6291,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 
s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6371,8 +6318,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6380,24 +6327,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; 
GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6406,53 +6349,46 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 
v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_addc_u32 
s9, s37, 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6460,138 +6396,118 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword 
v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: 
s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 
-; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 
v[41:42], v[3:4], 4.0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -6599,76 +6515,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, 
s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -6676,69 +6582,61 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, 
s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; 
GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6751,8 +6649,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6762,17 +6660,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, 
v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6780,8 +6674,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6789,26 +6683,22 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 
s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6820,8 +6710,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6829,24 +6719,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: 
v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8189,17 +8075,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 
0xc3300000 @@ -8234,25 +8120,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8277,23 +8163,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; 
GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8318,30 +8204,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: 
s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8350,19 +8236,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -8374,7 +8260,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8402,12 +8288,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -8417,14 +8303,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; 
GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8433,9 +8319,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm @@ -8446,17 +8332,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, 
v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8491,25 +8377,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: 
v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8534,23 +8420,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 
v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8575,30 +8461,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -8607,19 +8493,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8631,7 +8517,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8659,12 +8545,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8674,14 +8560,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8690,9 +8576,9 
@@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9648,17 +9534,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9693,25 +9579,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9736,23 +9622,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, 
s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -9777,30 +9663,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -9809,19 +9695,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -9833,7 +9719,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -9861,12 +9747,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -9876,14 +9762,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], 
s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9892,9 +9778,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -9905,17 +9791,17 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; 
GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9950,25 +9836,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: 
v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9993,23 +9879,23 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -10034,30 +9920,30 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] @@ -10066,19 +9952,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -10090,7 +9976,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -10118,12 +10004,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -10133,14 +10019,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10149,9 +10035,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; 
GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -12045,55 +11931,48 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, 
v[0:1] +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -12101,140 +11980,120 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: 
global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], 
s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v3 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -12242,76 +12101,66 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; 
GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; 
GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12319,48 +12168,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12381,15 +12226,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12400,8 +12241,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12411,17 +12252,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12429,8 +12266,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12438,12 +12275,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12458,14 +12295,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12475,8 +12308,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12484,24 +12317,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12510,55 +12339,48 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; 
GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -12566,140 +12388,120 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; 
GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 
-; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 
0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, 
SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12707,76 +12509,66 @@ 
define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: 
buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; 
GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12784,48 +12576,44 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12846,15 +12634,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12865,8 +12649,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; 
GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12876,17 +12660,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12894,8 +12674,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, 
exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12903,12 +12683,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12923,14 +12703,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12940,8 +12716,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: 
v_add_f64 v[3:4], v[1:2], v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12949,24 +12725,20 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: 
s_set_inst_prefetch_distance 0x2 @@ -14305,15 +14077,15 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -14341,23 +14113,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: 
s_cbranch_execz .LBB18_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14373,23 +14145,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, 
s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14405,22 +14177,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14428,33 +14200,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-NEXT: .LBB18_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14472,23 +14244,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14497,24 +14269,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-NEXT: .LBB18_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -14542,23 +14314,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14574,23 +14346,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: 
s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14606,22 +14378,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14629,33 +14401,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; 
GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1032-DPP-NEXT: .LBB18_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; 
GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14673,23 +14445,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14698,9 +14470,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 
s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 ; GFX1132-DPP-NEXT: .LBB18_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -14711,15 +14483,15 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -14747,23 +14519,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14779,23 +14551,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; 
GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14811,22 +14583,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14834,33 +14606,33 @@ define amdgpu_kernel void 
@global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-NEXT: .LBB19_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14878,23 +14650,23 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14903,24 +14675,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-NEXT: .LBB19_3: ; GFX1132-NEXT: 
s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -14948,23 +14720,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -14980,23 +14752,23 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -15012,22 +14784,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 @@ -15035,33 +14807,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1032-DPP-NEXT: .LBB19_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15079,23 +14851,23 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15104,9 +14876,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 ; GFX1132-DPP-NEXT: .LBB19_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 46f0bb03938857..388d57cc488598 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: 
s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -183,10 +183,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -212,10 +212,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,10 +227,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -241,13 +241,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -260,12 +260,12 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1] @@ -1085,10 +1085,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1118,10 +1118,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,10 
+1147,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1174,17 +1174,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1193,8 +1193,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; 
GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1202,13 +1202,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1234,18 +1234,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1255,9 +1255,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1267,10 +1267,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -1300,10 +1300,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1329,10 +1329,10 @@ 
define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1356,17 +1356,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1375,8 +1375,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1384,13 +1384,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1416,18 +1416,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: 
s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1437,9 +1437,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2376,10 +2376,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2409,10 +2409,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: 
s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2438,10 +2438,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2465,17 +2465,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2484,8 +2484,8 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2493,13 +2493,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2525,18 +2525,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, 
s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2546,9 +2546,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2558,10 +2558,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -2591,10 +2591,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; 
GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,10 +2620,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2647,17 +2647,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2666,8 +2666,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2675,13 +2675,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2707,18 +2707,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: 
s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2728,9 +2728,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3663,51 +3663,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: 
s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; 
GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3715,130 +3708,110 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: 
buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 
s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 
v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3846,73 +3819,63 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; 
GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, 
exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3920,42 +3883,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] 
-; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3964,7 +3923,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3972,15 +3931,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3991,8 +3946,8 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4002,18 +3957,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4021,8 +3972,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: 
s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4032,21 +3983,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -4056,8 +4003,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4065,26 +4012,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 
:: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4093,51 +4036,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 
s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], 
v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -4145,130 +4081,110 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; 
GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: 
buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: 
global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; 
GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4276,73 +4192,63 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4350,42 +4256,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 
offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -4394,7 +4296,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4402,15 +4304,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4421,8 +4319,8 @@ 
define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4432,18 +4330,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4451,8 +4345,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: 
s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4462,21 +4356,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -4486,8 +4376,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, 
s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4495,26 +4385,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5934,10 +5820,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, 
exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5971,10 +5857,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6002,10 +5888,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6031,18 +5917,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 
0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6052,8 +5938,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -6061,13 +5947,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6095,18 +5981,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6116,9 +6002,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -6128,10 +6014,10 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -6165,10 +6051,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6196,10 +6082,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x0 @@ -6225,18 +6111,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6246,8 +6132,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6255,13 +6141,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; 
GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6289,18 +6175,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6310,9 +6196,9 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -7346,51 +7232,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, 
s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; 
GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -7398,130 +7277,110 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 ; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; 
GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -7529,73 +7388,63 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; 
GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 
0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -7603,42 +7452,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -7647,7 
+7492,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7655,15 +7500,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7674,8 +7515,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -7685,18 +7526,14 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7704,8 +7541,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7715,21 +7552,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: 
s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7739,8 +7572,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -7748,26 +7581,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; 
GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7776,51 +7605,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] 
; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7828,130 +7650,110 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; 
GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; 
GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, 
s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7959,73 +7761,63 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 
; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; 
GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -8033,42 +7825,38 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] 
-; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8077,7 +7865,7 @@ define amdgpu_kernel void 
@global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8085,15 +7873,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -8104,8 +7888,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 @@ -8115,18 +7899,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8134,8 +7914,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8145,21 +7925,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; 
GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8169,8 +7945,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -8178,26 +7954,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: 
s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -9617,10 +9389,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -9650,10 +9422,10 @@ define amdgpu_kernel 
void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9679,10 +9451,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -9706,17 +9478,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, 
s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -9725,8 +9497,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -9734,13 +9506,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -9766,18 +9538,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9787,9 +9559,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -9799,10 +9571,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9832,10 +9604,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9861,10 +9633,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -9888,17 +9660,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: 
s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -9907,8 +9679,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -9916,13 +9688,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -9948,18 +9720,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9969,9 +9741,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; 
GFX1132-DPP-NEXT: s_endpgm @@ -9985,10 +9757,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -10018,10 +9790,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10047,10 +9819,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -10074,17 +9846,17 @@ define amdgpu_kernel void 
@global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -10093,8 +9865,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -10102,13 +9874,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -10134,18 +9906,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10155,9 +9927,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -10167,10 +9939,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -10200,10 +9972,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10229,10 +10001,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -10256,17 +10028,17 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -10275,8 +10047,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -10284,13 +10056,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: 
global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -10316,18 +10088,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) @@ -10337,9 +10109,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index bd5e589ec2be7f..ff649ebffb6ff1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,10 +54,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -83,10 +83,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -98,10 +98,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -112,13 +112,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -131,12 +131,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -183,10 +183,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; 
%bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -212,10 +212,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -227,10 +227,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -241,13 +241,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -260,12 +260,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1] @@ -1085,10 +1085,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -1118,10 +1118,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,10 +1147,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1174,17 +1174,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: 
.LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1193,8 +1193,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm @@ -1202,13 +1202,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1234,18 +1234,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1255,9 +1255,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm @@ -1267,10 +1267,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -1300,10 +1300,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1329,10 +1329,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -1356,17 +1356,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -1375,8 +1375,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -1384,13 +1384,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1416,18 +1416,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 
s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1437,9 +1437,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2376,10 +2376,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -2409,10 +2409,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2438,10 +2438,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2465,17 +2465,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2484,8 +2484,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm @@ -2493,13 +2493,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2525,18 +2525,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: 
s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2546,9 +2546,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm @@ -2558,10 +2558,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: 
s_mov_b64 s[4:5], 0 @@ -2591,10 +2591,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,10 +2620,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -2647,17 +2647,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -2666,8 +2666,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -2675,13 +2675,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2707,18 +2707,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; 
GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2728,9 +2728,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -3663,51 +3663,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; 
GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -3715,130 +3708,110 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: 
s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-NEXT: .LBB6_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-NEXT: ; %bb.1: -; 
GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; 
GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-NEXT: .LBB6_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: 
s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -3846,73 +3819,63 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-NEXT: .LBB6_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -3920,42 +3883,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: 
buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-NEXT: .LBB6_3: ; GFX1032-NEXT: s_endpgm @@ -3964,7 +3923,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3972,15 +3931,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], 
s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -3991,8 +3946,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4002,18 +3957,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4021,8 +3972,8 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-NEXT: .LBB6_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4032,21 +3983,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -4056,8 +4003,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; 
GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4065,26 +4012,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-NEXT: .LBB6_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4093,51 +4036,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 
+; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -4145,130 +4081,110 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 
s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX7LESS-DPP-NEXT: .LBB6_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 
s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; 
GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 ; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 
0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX9-DPP-NEXT: .LBB6_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; 
GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4276,73 +4192,63 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; 
GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1064-DPP-NEXT: .LBB6_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 
s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4350,42 +4256,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1032-DPP-NEXT: .LBB6_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -4394,7 +4296,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4402,15 +4304,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; 
GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -4421,8 +4319,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4432,18 +4330,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4451,8 +4345,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1164-DPP-NEXT: .LBB6_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -4462,21 +4356,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -4486,8 +4376,8 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -4495,26 +4385,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 ; GFX1132-DPP-NEXT: .LBB6_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -5934,10 +5820,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -5971,10 +5857,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6002,10 +5888,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6031,18 +5917,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6052,8 +5938,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-NEXT: .LBB8_3: ; GFX1032-NEXT: s_endpgm @@ -6061,13 +5947,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6095,18 +5981,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6116,9 +6002,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; 
GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-NEXT: .LBB8_3: ; GFX1132-NEXT: s_endpgm @@ -6128,10 +6014,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -6165,10 +6051,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6196,10 +6082,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -6225,18 +6111,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] @@ -6246,8 +6132,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: 
s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1032-DPP-NEXT: .LBB8_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -6255,13 +6141,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -6289,18 +6175,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; 
GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6310,9 +6196,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 ; GFX1132-DPP-NEXT: .LBB8_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -7346,51 +7232,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -7398,130 +7277,110 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: 
s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-NEXT: .LBB10_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 ; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-NEXT: .LBB10_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; 
GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -7529,73 +7388,63 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; 
GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-NEXT: .LBB10_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -7603,42 +7452,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: 
s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; 
GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-NEXT: .LBB10_3: ; GFX1032-NEXT: s_endpgm @@ -7647,7 +7492,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7655,15 +7500,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -7674,8 +7515,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; 
GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -7685,18 +7526,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7704,8 +7541,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-NEXT: .LBB10_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7715,21 +7552,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; 
GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7739,8 +7572,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -7748,26 +7581,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: 
s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-NEXT: .LBB10_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -7776,51 +7605,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: 
s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 +; 
GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -7828,130 +7650,110 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; 
GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX7LESS-DPP-NEXT: .LBB10_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 ; GFX9-DPP-NEXT: v_min_f64 v[3:4], 
v[3:4], 4.0 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX9-DPP-NEXT: .LBB10_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; 
GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -7959,73 +7761,63 @@ define 
amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 
0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1064-DPP-NEXT: .LBB10_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; 
GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -8033,42 +7825,38 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 
0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 
-; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1032-DPP-NEXT: .LBB10_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -8077,7 +7865,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8085,15 +7873,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -8104,8 +7888,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; 
GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -8115,18 +7899,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8134,8 +7914,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1164-DPP-NEXT: .LBB10_3: ; 
GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8145,21 +7925,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -8169,8 +7945,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -8178,26 +7954,22 @@ define amdgpu_kernel void 
@global_atomic_fmin_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 ; GFX1132-DPP-NEXT: .LBB10_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -9617,10 +9389,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -9650,10 +9422,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9679,10 +9451,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -9706,17 +9478,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -9725,8 +9497,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-NEXT: .LBB12_3: ; GFX1032-NEXT: s_endpgm @@ -9734,13 +9506,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -9766,18 +9538,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9787,9 +9559,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-NEXT: .LBB12_3: ; GFX1132-NEXT: s_endpgm @@ -9799,10 +9571,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -9832,10 +9604,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9861,10 +9633,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -9888,17 +9660,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: 
global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -9907,8 +9679,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1032-DPP-NEXT: .LBB12_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -9916,13 +9688,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: 
s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -9948,18 +9720,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9969,9 +9741,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; 
GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 ; GFX1132-DPP-NEXT: .LBB12_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9985,10 +9757,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -10018,10 +9790,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10047,10 +9819,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -10074,17 +9846,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 @@ -10093,8 +9865,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm @@ -10102,13 +9874,13 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164: ; %bb.0: ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -10134,18 +9906,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132: ; %bb.0: ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10155,9 +9927,9 @@ define amdgpu_kernel void 
@global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -10167,10 +9939,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -10200,10 +9972,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10229,10 +10001,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -10256,17 +10028,17 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1032-DPP: ; %bb.0: ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 @@ -10275,8 +10047,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, 
vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm @@ -10284,13 +10056,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1164-DPP: ; %bb.0: ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -10316,18 +10088,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX1132-DPP: ; %bb.0: ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10337,9 +10109,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 5ffa71d37164c3..0098146894ca97 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -18,15 +18,15 @@ declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 @@ -54,23 +54,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -118,22 +118,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -141,33 +141,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-NEXT: .LBB0_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -185,23 +185,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -210,24 +210,24 @@ define amdgpu_kernel void 
@global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-NEXT: .LBB0_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0 @@ -255,23 +255,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -287,23 +287,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: 
v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -319,22 +319,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -342,33 +342,33 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1032-DPP-NEXT: .LBB0_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -386,23 +386,23 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 ; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -411,9 +411,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 
s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 ; GFX1132-DPP-NEXT: .LBB0_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -1269,21 +1269,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1315,25 +1315,25 @@ define 
amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1357,31 +1357,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: 
s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1397,29 +1397,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1428,19 +1428,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, 
vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-NEXT: .LBB2_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1479,12 +1479,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: 
scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -1494,14 +1494,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1510,30 +1510,30 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-NEXT: .LBB2_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; 
GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -1565,25 +1565,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec 
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1607,31 +1607,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -1647,29 +1647,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: 
s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1678,19 +1678,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1032-DPP-NEXT: .LBB2_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: 
global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1702,7 +1702,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -1729,12 +1729,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -1744,14 +1744,14 @@ define amdgpu_kernel 
void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1760,9 +1760,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 ; GFX1132-DPP-NEXT: .LBB2_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -2619,21 +2619,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: 
s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2665,25 +2665,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2707,31 +2707,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 
s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2747,29 +2747,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 
; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2778,19 +2778,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-NEXT: .LBB4_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -2802,7 +2802,7 @@ 
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -2829,12 +2829,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -2844,14 +2844,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 
v2, 4.0, v0 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2860,30 +2860,30 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-NEXT: .LBB4_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -2915,25 +2915,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2957,31 
+2957,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 
4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -2997,29 +2997,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 
v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3028,19 +3028,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1032-DPP-NEXT: .LBB4_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -3052,7 +3052,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -3079,12 +3079,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -3094,14 +3094,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3110,9 
+3110,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 ; GFX1132-DPP-NEXT: .LBB4_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -4815,21 +4815,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -4861,25 +4861,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4903,31 +4903,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-LABEL: 
global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -4943,29 +4943,29 @@ define amdgpu_kernel 
void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop 
Header: Depth=1 @@ -4974,19 +4974,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-NEXT: .LBB7_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -4998,7 +4998,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5025,12 +5025,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: 
v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -5040,14 +5040,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5056,30 +5056,30 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-NEXT: .LBB7_3: ; GFX1132-NEXT: s_endpgm ; ; GFX7LESS-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -5111,25 +5111,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5153,31 +5153,31 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: 
s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 @@ -5193,29 +5193,29 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5224,19 +5224,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, 
vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1032-DPP-NEXT: .LBB7_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -5248,7 +5248,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -5275,12 +5275,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; 
GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -5290,14 +5290,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5306,9 +5306,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2 ; GFX1132-DPP-NEXT: .LBB7_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -6165,53 +6165,46 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; 
GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: 
v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -6219,138 +6212,118 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: 
s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-NEXT: .LBB9_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s40, s7 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
__atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, 
s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-NEXT: .LBB9_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -6358,76 +6331,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: 
s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-NEXT: .LBB9_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: 
v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -6435,69 +6398,61 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 
s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 
offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-NEXT: .LBB9_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_mov_b32 s32, 32 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -6510,8 +6465,8 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6521,17 +6476,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6539,8 +6490,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1164-NEXT: .LBB9_3: ; GFX1164-NEXT: 
s_set_inst_prefetch_distance 0x2 @@ -6548,26 +6499,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_mov_b32 s32, 32 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -6579,8 +6526,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; 
GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6588,24 +6535,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-NEXT: .LBB9_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6614,53 +6557,46 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; 
GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3 -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; 
GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -6668,138 +6604,118 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX7LESS-DPP-NEXT: .LBB9_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 
s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24 -; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-DPP-NEXT: .LBB9_2: ; 
%atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) -; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], -v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 -; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] -; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 
s37 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX9-DPP-NEXT: .LBB9_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 
s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -6807,76 +6723,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, 
__atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: 
buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1064-DPP-NEXT: .LBB9_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; 
GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -6884,69 +6790,61 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1032-DPP-NEXT: .LBB9_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6959,8 +6857,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -6970,17 +6868,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6988,8 +6882,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; 
GFX1164-DPP-NEXT: .LBB9_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -6997,26 +6891,22 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 -; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6 -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7028,8 +6918,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; 
GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -7037,24 +6927,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 ; GFX1132-DPP-NEXT: .LBB9_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -8397,17 +8283,17 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8442,25 +8328,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8485,23 +8371,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8526,30 +8412,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; 
GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8558,19 +8444,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-NEXT: .LBB11_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -8582,7 +8468,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 
0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8610,12 +8496,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -8625,14 +8511,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8641,9 +8527,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], 
v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-NEXT: .LBB11_3: ; GFX1132-NEXT: s_endpgm @@ -8654,17 +8540,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -8699,25 +8585,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -8742,23 +8628,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: 
s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -8783,30 +8669,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -8815,19 +8701,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: 
s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1032-DPP-NEXT: .LBB11_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8839,7 +8725,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -8867,12 +8753,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; 
GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -8882,14 +8768,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8898,9 +8784,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 ; GFX1132-DPP-NEXT: .LBB11_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -9855,17 +9741,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s12, 
s12, s9 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -9900,25 +9786,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-NEXT: ; %bb.1: ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9943,23 +9829,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s14, -1 -; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -9984,30 +9870,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10016,19 +9902,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-NEXT: .LBB13_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-NEXT: scratch_store_b32 off, v1, off @@ -10040,7 +9926,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -10068,12 +9954,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-NEXT: s_mov_b32 s4, 0 -; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-NEXT: scratch_store_b32 off, v1, off @@ -10083,14 +9969,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10099,9 +9985,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-NEXT: .LBB13_3: ; GFX1132-NEXT: s_endpgm @@ -10112,17 +9998,17 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s3 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[2:3] ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 @@ -10157,25 +10043,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s14, -1 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9 -; 
GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 -; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10200,23 +10086,23 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1] -; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -10241,30 +10127,30 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: 
s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 -; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0 -; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7] +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] @@ -10273,19 +10159,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1032-DPP-NEXT: .LBB13_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; 
GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off @@ -10297,7 +10183,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1164-DPP-NEXT: ; %bb.1: ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -10325,12 +10211,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s4, 0 -; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off @@ -10340,14 +10226,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; 
GFX1132-DPP-NEXT: ; %bb.1: ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10356,9 +10242,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 ; GFX1132-DPP-NEXT: .LBB13_3: ; GFX1132-DPP-NEXT: s_endpgm @@ -12251,55 +12137,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS: ; %bb.0: ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-NEXT: s_addc_u32 s49, 
s49, 0 -; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 
s[38:39], 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -12307,140 +12186,120 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] -; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-NEXT: .LBB16_3: ; GFX7LESS-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s50, -1 -; GFX9-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-NEXT: s_add_u32 s48, s48, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; 
GFX9-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] ; GFX9-NEXT: s_movk_i32 s32, 0x800 -; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-NEXT: s_mov_b32 s41, s6 -; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX9-NEXT: s_add_u32 s8, s36, 44 -; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-NEXT: 
buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-NEXT: s_mov_b32 s12, s41 -; GFX9-NEXT: s_mov_b32 s13, s40 -; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-NEXT: .LBB16_3: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-NEXT: s_mov_b32 s50, -1 -; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] ; GFX1064-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-NEXT: s_mov_b32 s33, s8 -; GFX1064-NEXT: s_mov_b32 s40, s7 -; GFX1064-NEXT: s_mov_b32 s41, s6 -; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; 
GFX1064-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0 @@ -12448,76 +12307,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, s42 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-NEXT: s_mov_b32 s12, s41 -; GFX1064-NEXT: s_mov_b32 s13, s40 -; GFX1064-NEXT: s_mov_b32 s14, s33 -; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, 
s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1064-NEXT: .LBB16_3: ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-NEXT: s_mov_b32 s44, 0 -; GFX1032-NEXT: 
v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-NEXT: s_mov_b32 s33, s8 -; GFX1032-NEXT: s_mov_b32 s40, s7 -; GFX1032-NEXT: s_mov_b32 s41, s6 -; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0 @@ -12525,48 +12374,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-NEXT: s_getpc_b64 s[0:1] ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 
-; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-NEXT: s_mov_b32 s12, s41 -; GFX1032-NEXT: s_mov_b32 s13, s40 -; GFX1032-NEXT: s_mov_b32 s14, s33 -; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; 
GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-NEXT: .LBB16_3: ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164: ; %bb.0: -; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -12587,15 +12432,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-NEXT: s_mov_b32 s33, s8 -; GFX1164-NEXT: s_mov_b32 s40, s7 -; GFX1164-NEXT: s_mov_b32 s41, s6 -; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0 @@ -12606,8 +12447,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-NEXT: s_getpc_b64 s[0:1] ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12617,17 +12458,13 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-NEXT: s_mov_b32 s12, s41 -; GFX1164-NEXT: s_mov_b32 s13, s40 -; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s12, s33 ; GFX1164-NEXT: s_clause 0x1 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -12635,8 +12472,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-NEXT: .LBB16_3: ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12644,12 +12481,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132: ; %bb.0: -; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-NEXT: 
s_mov_b32 s44, 0 +; GFX1132-NEXT: s_mov_b32 s38, 0 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -12664,14 +12501,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-NEXT: s_mov_b32 s33, s15 -; GFX1132-NEXT: s_mov_b32 s40, s14 -; GFX1132-NEXT: s_mov_b32 s41, s13 -; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 @@ -12681,8 +12514,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-NEXT: s_getpc_b64 s[0:1] ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -12690,24 +12523,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-NEXT: s_mov_b32 s12, s41 -; GFX1132-NEXT: s_mov_b32 s13, s40 -; 
GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_mov_b32 s12, s33 ; GFX1132-NEXT: s_clause 0x1 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-NEXT: .LBB16_3: ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 @@ -12716,55 +12545,48 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX7LESS-DPP: ; %bb.0: ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1 -; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000 -; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] -; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s3 
+; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v2, exec_lo, 0 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 -; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5 -; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] -; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 +; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX7LESS-DPP-NEXT: ; %bb.1: -; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8 -; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7 -; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9 +; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1 -; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2 ; 
GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] -; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 -; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 -; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 -; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2) ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0 @@ -12772,140 +12594,120 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41 -; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40 -; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s33 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], 
s[50:51] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0 -; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX7LESS-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX7LESS-DPP-NEXT: .LBB16_3: ; GFX7LESS-DPP-NEXT: s_endpgm ; ; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX9-DPP: ; %bb.0: -; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX9-DPP-NEXT: s_mov_b32 s50, -1 -; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 -; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], 
s[0:1] ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4] +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX9-DPP-NEXT: ; %bb.1: -; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX9-DPP-NEXT: s_mov_b32 s33, s8 -; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s41, s6 -; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: 
s_addc_u32 s9, s35, 0 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX9-DPP-NEXT: s_mov_b32 s12, s41 -; GFX9-DPP-NEXT: s_mov_b32 s13, s40 -; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 -; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; 
GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX9-DPP-NEXT: .LBB16_3: ; GFX9-DPP-NEXT: s_endpgm ; ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1064-DPP: ; %bb.0: -; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 -; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 -; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] -; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], 
vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12913,76 +12715,66 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 
s42 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1064-DPP-NEXT: s_clause 0x1 -; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; 
GFX1064-DPP-NEXT: .LBB16_3: ; GFX1064-DPP-NEXT: s_endpgm ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 -; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 -; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 -; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4] -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2 -; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1 -; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; 
GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3 -; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -12990,48 +12782,44 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] -; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 -; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1032-DPP-NEXT: buffer_store_dword v4, off, 
s[48:51], 0 offset:12 -; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX1032-DPP-NEXT: s_clause 0x1 -; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1032-DPP-NEXT: .LBB16_3: ; GFX1032-DPP-NEXT: s_endpgm ; ; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1164-DPP: ; %bb.0: -; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 @@ -13052,15 +12840,11 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1164-DPP-NEXT: ; %bb.1: -; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 -; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 -; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 -; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 -; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] -; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 @@ -13071,8 +12855,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -13082,17 +12866,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 
s12, s33 ; GFX1164-DPP-NEXT: s_clause 0x1 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 -; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -13100,8 +12880,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] -; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1164-DPP-NEXT: .LBB16_3: ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 @@ -13109,12 +12889,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; ; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp: ; GFX1132-DPP: ; %bb.0: -; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 -; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 ; GFX1132-DPP-NEXT: s_clause 0x1 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 @@ -13129,14 +12909,10 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 ; GFX1132-DPP-NEXT: ; %bb.1: -; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 -; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 -; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 -; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] -; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 @@ -13146,8 +12922,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] -; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44 -; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 @@ -13155,24 +12931,20 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] -; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] -; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 -; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 -; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 ; GFX1132-DPP-NEXT: s_clause 0x1 ; 
GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 -; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 -; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 ; GFX1132-DPP-NEXT: .LBB16_3: ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index e41634402c0c2b..3b71e8ffefbf8c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -71,7 +71,7 @@ bb: ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], s[[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @@ -100,7 +100,7 @@ define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK-DAG: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK-DAG: s_load_dwordx2 
[[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[6:7], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 ; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 3735c6349fbb31..e2d55990473c09 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: load_f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -19,8 +19,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -31,10 +31,10 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; GFX11-LABEL: load_f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) @@ -46,8 +46,8 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: load_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,8 +57,8 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -69,10 +69,10 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; GFX11-LABEL: load_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -84,7 +84,7 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, 
s1, 0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -130,7 +130,7 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -145,12 +145,12 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: load_v8f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -159,12 +159,12 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x 
half> %arg ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -174,8 +174,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-LABEL: load_v8f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 @@ -191,8 +191,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { ; CI-LABEL: extload_v2f16_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -204,8 +204,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -218,13 +218,13 @@ 
define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; GFX11-LABEL: extload_v2f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -237,8 +237,8 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -260,11 +260,11 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; GFX11-LABEL: extload_f16_to_f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -277,8 +277,8 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: extload_v2f16_to_v2f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -290,8 +290,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -304,13 +304,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -323,7 +323,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -336,7 +336,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -349,7 +349,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -368,7 +368,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -383,7 +383,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 @@ -419,8 +419,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -447,8 +447,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -476,8 +476,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s8, s7, 16 @@ -506,10 +506,10 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) 
%out, <8 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -519,10 +519,10 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -532,14 +532,14 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; ; GFX11-LABEL: extload_f16_to_f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +552,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { ; CI-LABEL: 
extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -568,12 +568,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[6:7], 0x8 +; VI-NEXT: s_load_dword s0, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -584,17 +584,17 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; ; GFX11-LABEL: extload_v2f16_to_v2f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s1, s0, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: 
s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -607,7 +607,7 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -628,7 +628,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -649,7 +649,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -675,7 +675,7 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; 
VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -725,7 +725,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s5, s3, 16 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 @@ -754,8 +754,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { ; CI-LABEL: extload_v8f16_to_v8f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -801,8 +801,8 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -848,20 +848,22 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; ; GFX11-LABEL: extload_v8f16_to_v8f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s9, s7, 16 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16 -; GFX11-NEXT: s_lshr_b32 s1, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX11-NEXT: 
s_lshr_b32 s0, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 @@ -870,9 +872,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 @@ -889,7 +889,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -902,7 +902,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: global_load_store_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @global_load_store_v2f16(ptr 
addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -932,7 +932,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -949,7 +949,7 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -962,7 +962,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; ; GFX11-LABEL: global_load_store_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] @@ -979,7 +979,7 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define 
amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: global_load_store_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1042,7 +1042,7 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ 
-1073,7 +1073,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] @@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1128,7 +1128,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] @@ -1151,7 +1151,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: 
v_mov_b32_e32 v1, s3 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] @@ -1212,7 +1212,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] @@ -1300,7 +1300,7 @@ define amdgpu_kernel void 
@global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -1358,7 +1358,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v20, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: global_extload_f16_to_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1493,7 +1493,7 @@ define 
amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v2f16_to_v2f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] @@ -1553,7 +1553,7 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,7 +1602,7 @@ define 
amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v3f16_to_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v4f16_to_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] @@ -1718,7 +1718,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ 
-1766,7 +1766,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1810,7 +1810,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; ; GFX11-LABEL: global_extload_v8f16_to_v8f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,7 +1947,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2033,7 +2033,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; ; GFX11-LABEL: global_extload_v16f16_to_v16f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2102,7 +2102,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr 
addrspace(1) %out define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2116,7 +2116,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; ; GFX11-LABEL: global_truncstore_f32_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2135,7 +2135,7 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2152,7 +2152,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2168,7 +2168,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -2190,7 +2190,7 @@ define 
amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2235,7 +2235,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] @@ -2260,7 +2260,7 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ 
-2300,7 +2300,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2325,7 +2325,7 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2391,7 +2391,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2425,7 +2425,7 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 
; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -2606,12 +2606,12 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 { ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s0, s[6:7], 0x2 +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2622,8 +2622,8 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2636,13 +2636,13 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; GFX11-LABEL: fadd_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f16_e64 v1, s4, s2 +; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2676,7 +2676,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 @@ -2709,7 +2709,7 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX11-LABEL: fadd_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -2787,8 +2787,8 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2845,8 +2845,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s15, 16 ; VI-NEXT: s_lshr_b32 s3, s11, 16 @@ -2888,8 +2888,8 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; GFX11-LABEL: fadd_v8f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2921,7 +2921,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; ; GFX11-LABEL: test_bitcast_from_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] @@ -2939,7 +2939,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2952,7 +2952,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: test_bitcast_to_half: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll index 380a8e911e4995..b6eff8846dc8c7 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll @@ -98,4 +98,4 @@ bb: ret void } -attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll index 10c5ffd0eb07e6..6a49eac134a67b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-heap-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll index 677584caa8b2e6..6f4c8911efd33b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s 
| llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll index 1a5a7698e2f96d..01f8fbfd76314a 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 8c017fa5ec2636..7a9f4ae8a20fae 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -15,7 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 10 +; CHECK: .sgpr_count: 6 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" { + ptr addrspace(1) %b) { entry: %a.val = load half, ptr addrspace(1) %a %b.val = load half, ptr addrspace(1) %b @@ -47,10 +47,10 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 10 -; GFX803: .sgpr_spill_count: 10 -; GFX900: .sgpr_spill_count: 62 -; GFX1010: 
.sgpr_spill_count: 60 +; GFX700: .sgpr_spill_count: 12 +; GFX803: .sgpr_spill_count: 12 +; GFX900: .sgpr_spill_count: 48 +; GFX1010: .sgpr_spill_count: 48 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32], @@ -61,37 +61,27 @@ define amdgpu_kernel void @num_spilled_sgprs( ptr addrspace(1) %outa, ptr addrspace(1) %outb, [8 x i32], ptr addrspace(1) %outc, ptr addrspace(1) %outd, [8 x i32], ptr addrspace(1) %oute, ptr addrspace(1) %outf, [8 x i32], - ptr addrspace(1) %outg, ptr addrspace(1) %outh, [8 x i32], - ptr addrspace(1) %outi, ptr addrspace(1) %outj, [8 x i32], - ptr addrspace(1) %outk, ptr addrspace(1) %outl, [8 x i32], - ptr addrspace(1) %outm, ptr addrspace(1) %outn, [8 x i32], i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32], i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32], i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32], - i32 %inc, i32 %ind, i32 %ine, i32 %inf, i32 %ing, i32 %inh, - i32 %ini, i32 %inj, i32 %ink) #0 { + i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 { entry: - store volatile i32 %in0, ptr addrspace(1) %out0 - store volatile i32 %in1, ptr addrspace(1) %out1 - store volatile i32 %in2, ptr addrspace(1) %out2 - store volatile i32 %in3, ptr addrspace(1) %out3 - store volatile i32 %in4, ptr addrspace(1) %out4 - store volatile i32 %in5, ptr addrspace(1) %out5 - store volatile i32 %in6, ptr addrspace(1) %out6 - store volatile i32 %in7, ptr addrspace(1) %out7 - store volatile i32 %in8, ptr addrspace(1) %out8 - store volatile i32 %in9, ptr addrspace(1) %out9 - store volatile i32 %ina, ptr addrspace(1) %outa - store volatile i32 %inb, ptr addrspace(1) %outb - store volatile i32 %inc, ptr addrspace(1) %outc - store volatile i32 %ind, ptr addrspace(1) %outd - store volatile i32 %ine, ptr addrspace(1) %oute - store volatile i32 %inf, ptr addrspace(1) %outf - store volatile i32 %ing, ptr addrspace(1) %outg - store volatile i32 %inh, ptr addrspace(1) %outh - store 
volatile i32 %ini, ptr addrspace(1) %outi - store volatile i32 %inj, ptr addrspace(1) %outj - store volatile i32 %ink, ptr addrspace(1) %outk + store i32 %in0, ptr addrspace(1) %out0 + store i32 %in1, ptr addrspace(1) %out1 + store i32 %in2, ptr addrspace(1) %out2 + store i32 %in3, ptr addrspace(1) %out3 + store i32 %in4, ptr addrspace(1) %out4 + store i32 %in5, ptr addrspace(1) %out5 + store i32 %in6, ptr addrspace(1) %out6 + store i32 %in7, ptr addrspace(1) %out7 + store i32 %in8, ptr addrspace(1) %out8 + store i32 %in9, ptr addrspace(1) %out9 + store i32 %ina, ptr addrspace(1) %outa + store i32 %inb, ptr addrspace(1) %outb + store i32 %inc, ptr addrspace(1) %outc + store i32 %ind, ptr addrspace(1) %outd + store i32 %ine, ptr addrspace(1) %oute + store i32 %inf, ptr addrspace(1) %outf ret void } @@ -170,7 +160,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 1 -attributes #0 = { "amdgpu-num-sgpr"="20" } +attributes #0 = { "amdgpu-num-sgpr"="14" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll index 0db5f01fc0ccc9..689619227b8d70 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git 
a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll index 6eece2c9bf4166..9854977c2f308b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -1,12 +1,10 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=amdgpu-attributor -o %t.gfx7.bc %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.gfx8.bc %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor -o %t.gfx9.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj < %t.gfx7.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.gfx8.bc | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %t.gfx9.bc | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %t.gfx7.bc | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.gfx8.bc | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %t.gfx9.bc | FileCheck --check-prefixes=CHECK,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck 
--check-prefixes=CHECK,GFX9 %s ; On gfx8, the queue ptr is required for this addrspacecast. diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll index acf829c4d3c720..cf26a427aec324 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll @@ -1,6 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj < %t.bc | llvm-readelf --notes - | FileCheck %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %t.bc | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=CHECK %s declare void @function1() diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll index 03242b69beb8c3..7986368e2a3584 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll @@ -31,14 +31,14 @@ ; GFX10: .sgpr_spill_count: 0 ; GFX10: .vgpr_count: 4 ; GFX10: .vgpr_spill_count: 0 -define amdgpu_kernel void @test1(ptr %x) #1 { +define amdgpu_kernel void @test1(ptr %x) { %1 = load volatile float, ptr %x %2 = call float @f(float %1) store volatile float %2, ptr %x ret void } -define internal float @f(float %arg0) #1 { +define internal float @f(float %arg0) #0 { %stack = alloca float, i32 4, align 4, addrspace(5) store volatile float 3.0, ptr addrspace(5) %stack %val = load volatile float, ptr addrspace(5) %stack @@ -135,7 +135,6 @@ define amdgpu_kernel void @test4() { } attributes #0 = { norecurse } -attributes #1 = { norecurse "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 487e62b6c3495e..de484677bf5e6b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -106,7 +106,7 @@ ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple -define amdgpu_kernel void @simple(ptr addrspace(1) %out) #0 { +define amdgpu_kernel void @simple(ptr addrspace(1) %out) { entry: store i32 0, ptr addrspace(1) %out ret void @@ -114,13 +114,11 @@ entry: ; HSA-LABEL: {{^}}simple_no_kernargs: ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @simple_no_kernargs() #0 { +define amdgpu_kernel void @simple_no_kernargs() { entry: store volatile i32 0, ptr addrspace(1) undef ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 78653d7e21ad81..7ee31bf4dce7cd 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -6,9 +6,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -52,10 +52,9 @@ 
define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -101,8 +100,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -172,9 +171,9 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -216,10 +215,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; 
GFX10-NEXT: s_sub_i32 s2, 0, s6 @@ -263,8 +261,8 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -333,14 +331,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_ashr_i32 s3, s5, 31 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 +; GFX9-NEXT: s_abs_i32 s2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s5, 0, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -351,70 +349,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s5 -; GFX9-NEXT: s_mul_i32 s7, s6, s4 -; GFX9-NEXT: s_sub_i32 s7, s2, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s6, s2 +; GFX9-NEXT: s_sub_i32 s7, s3, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s4 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_sub_i32 s9, s7, s2 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; 
GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, s9, s7 ; GFX9-NEXT: s_add_i32 s8, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s7, s2 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s3 -; GFX9-NEXT: s_sub_i32 s6, s6, s3 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_sub_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s5 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s3, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s4, 0, s2 +; GFX10-NEXT: s_ashr_i32 s3, s3, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s6, v0 +; GFX10-NEXT: v_readfirstlane_b32 s5, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s3, s3, s6 -; GFX10-NEXT: s_mul_hi_u32 s5, s6, s3 -; GFX10-NEXT: s_mov_b32 s3, 0 -; GFX10-NEXT: s_add_i32 s5, s6, s5 +; GFX10-NEXT: s_mul_i32 s4, s4, s5 +; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_add_i32 s5, s5, s6 ; GFX10-NEXT: .LBB2_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s6, s3, s5 -; 
GFX10-NEXT: s_mul_i32 s7, s6, s4 +; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX10-NEXT: s_mul_i32 s7, s6, s2 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_sub_i32 s7, s3, s7 -; GFX10-NEXT: s_sub_i32 s9, s7, s4 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_sub_i32 s7, s4, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s2 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_cmp_ge_u32 s7, s2 ; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_add_i32 s3, s3, 1 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 -; GFX10-NEXT: s_sub_i32 s6, s6, s2 +; GFX10-NEXT: s_add_i32 s4, s4, 1 +; GFX10-NEXT: s_xor_b32 s6, s6, s3 +; GFX10-NEXT: s_sub_i32 s6, s6, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -422,51 +420,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv32_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s5 -; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s3, 0, s4 +; GFX11-NEXT: s_abs_i32 s2, s3 +; GFX11-NEXT: s_ashr_i32 s3, s3, 31 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s4, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s6, v0 +; GFX11-NEXT: v_readfirstlane_b32 s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s3, s3, s6 +; GFX11-NEXT: s_mul_i32 s4, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s6, s3 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_add_i32 s5, s6, s5 +; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_add_i32 s5, s5, s6 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s6, s3, s5 -; GFX11-NEXT: s_mul_i32 s7, s6, s4 +; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX11-NEXT: s_mul_i32 s7, s6, s2 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_sub_i32 s7, s3, s7 +; GFX11-NEXT: s_sub_i32 s7, s4, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s9, s7, s4 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_sub_i32 s9, s7, s2 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_cmp_ge_u32 s7, s2 ; GFX11-NEXT: s_cselect_b32 s6, s8, s6 -; GFX11-NEXT: s_add_i32 s3, s3, 1 -; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_add_i32 s4, s4, 1 +; GFX11-NEXT: s_xor_b32 s6, s6, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s6, s2 +; GFX11-NEXT: s_sub_i32 s6, s6, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s6 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
0 -; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -492,126 +490,125 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_sub_i32 s3, 0, s4 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s3, s3, s5 -; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3 -; GFX9-NEXT: s_add_i32 s3, s5, s3 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: s_sub_i32 s5, s2, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s5, s3, s5 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_sub_i32 s6, s5, s4 -; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; 
GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_abs_i32 s4, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: s_sub_i32 s2, 0, s4 +; GFX10-NEXT: s_abs_i32 s2, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v0 +; GFX10-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s3 -; GFX10-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_add_i32 s3, s3, s5 +; GFX10-NEXT: s_mul_i32 s3, s3, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s4, s5 ; GFX10-NEXT: .LBB3_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_sub_i32 s5, s2, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX10-NEXT: s_mul_i32 s5, s5, s2 +; GFX10-NEXT: s_sub_i32 s5, s3, s5 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s5, s4 -; GFX10-NEXT: s_cmp_ge_u32 s5, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, s2 +; GFX10-NEXT: s_cmp_ge_u32 
s5, s2 ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_add_i32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: srem32_invariant_denom: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_abs_i32 s4, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX11-NEXT: s_sub_i32 s2, 0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_abs_i32 s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s3, 0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s3, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_add_i32 s3, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s3, s3, s4 +; GFX11-NEXT: 
s_mul_hi_u32 s5, s4, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s4, s5 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB3_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s2, s3 -; GFX11-NEXT: s_mul_i32 s5, s5, s4 +; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4 +; GFX11-NEXT: s_mul_i32 s5, s5, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s5, s2, s5 -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s5, s3, s5 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s6, s5, s4 -; GFX11-NEXT: s_cmp_ge_u32 s5, s4 +; GFX11-NEXT: s_sub_i32 s6, s5, s2 +; GFX11-NEXT: s_cmp_ge_u32 s5, s2 ; GFX11-NEXT: s_cselect_b32 s5, s6, s5 -; GFX11-NEXT: s_add_i32 s2, s2, 1 +; GFX11-NEXT: s_add_i32 s3, s3, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -637,14 +634,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: 
s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -658,6 +655,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v5, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -666,12 +664,12 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: udiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 @@ -687,7 +685,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 -; GFX10-NEXT: global_store_short v5, v4, s[4:5] +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -695,11 +693,11 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: udiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], 
s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -748,14 +746,14 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s0, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -767,11 +765,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 ; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 ; GFX9-NEXT: global_store_short v5, v4, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: 
; %bb.2: ; %bb2 @@ -780,13 +777,13 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: urem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -798,10 +795,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: global_store_short v5, v4, s[0:1] +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 @@ -810,11 +807,11 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -865,19 +862,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 ; GFX9-NEXT: s_ashr_i32 s5, s6, 30 ; GFX9-NEXT: s_or_b32 s5, s5, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -886,15 +883,14 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 ; GFX9-NEXT: s_cselect_b32 s5, s5, 0 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 ; GFX9-NEXT: 
s_lshl_b32 s5, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -903,19 +899,19 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: sdiv16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s2 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 ; GFX10-NEXT: s_ashr_i32 s4, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -926,12 +922,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo ; GFX10-NEXT: s_cselect_b32 s4, s4, 0 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_store_short v3, v2, s[0:1] +; GFX10-NEXT: global_store_short v3, v2, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; 
%bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -939,11 +935,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: sdiv16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 @@ -999,19 +995,19 @@ bb3: ; preds = %bb3, %bb define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) { ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0 -; GFX9-NEXT: s_movk_i32 s2, 0x400 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX9-NEXT: s_xor_b32 s6, s5, s4 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 ; GFX9-NEXT: s_ashr_i32 s6, s6, 30 ; GFX9-NEXT: s_or_b32 s8, s6, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1020,18 +1016,16 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 
s[6:7], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: v_add_u16_e64 v2, s4, 1 ; GFX9-NEXT: s_cselect_b32 s6, s8, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s2, v2 -; GFX9-NEXT: s_and_b32 s7, 0xffff, s3 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v2 ; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 ; GFX9-NEXT: s_lshl_b32 s6, s7, 1 -; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -1040,19 +1034,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-LABEL: srem16_invariant_denom: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX10-NEXT: s_xor_b32 s5, s4, s2 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 ; GFX10-NEXT: s_ashr_i32 s5, s5, 30 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 @@ -1065,13 +1059,13 @@ 
define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_cselect_b32 s5, s5, 0 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s3 -; GFX10-NEXT: v_readfirstlane_b32 s3, v2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 -; GFX10-NEXT: global_store_short v2, v3, s[0:1] +; GFX10-NEXT: global_store_short v2, v3, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1079,11 +1073,11 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-LABEL: srem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 011a366267afe1..9da07ea04ded59 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -12,8 +12,8 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; 
GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -40,8 +40,8 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -69,49 +69,48 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -150,8 +149,8 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -179,8 +178,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MulMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -207,46 +206,45 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MulMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword 
s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -290,8 +288,8 @@ entry: define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -318,8 +316,8 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -347,49 +345,48 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: 
idot2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: 
idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -425,8 +422,8 @@ entry: define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedTypedMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -453,8 +450,8 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedTypedMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -482,46 +479,45 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedTypedMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: 
global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -566,8 +562,8 @@ entry: define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_alt_AddOperands: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -594,8 +590,8 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_alt_AddOperands: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -623,13 +619,13 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_alt_AddOperands: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; 
GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -638,38 +634,37 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -705,8 +700,8 @@ entry: define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MixedExt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -733,8 +728,8 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MixedExt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -762,46 +757,45 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MixedExt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; 
GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -846,8 +840,8 @@ entry: define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_SameVec: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -872,8 +866,8 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_SameVec: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -899,48 +893,47 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_SameVec: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: 
v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v2, s2, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -986,8 +979,8 @@ entry: 
define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1015,8 +1008,8 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1044,49 +1037,48 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -1122,8 +1114,8 @@ entry: define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_v4i16_Hi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -1150,8 +1142,8 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_v4i16_Hi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1183,49 +1175,48 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_v4i16_Hi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -1261,8 +1252,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Even: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 
s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1290,8 +1281,8 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_v4i16_Even: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1319,46 +1310,45 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Even: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1403,8 +1393,8 @@ entry: define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_v4i16_Middle: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1432,8 +1422,8 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_v4i16_Middle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1461,46 +1451,45 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_v4i16_Middle: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-NODL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-NODL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], 
v4, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v0, v1, s2, v0 -; GFX9-DL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 +; GFX9-DL-NEXT: global_store_dword v4, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -1545,8 +1534,8 @@ entry: define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; GFX7-LABEL: notudot2_DiffIndex: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1573,8 +1562,8 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notudot2_DiffIndex: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1602,46 +1591,45 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notudot2_DiffIndex: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1686,8 +1674,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1715,8 +1703,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1745,50 +1733,49 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; 
GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, 
v2, v1, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1838,8 +1825,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1867,8 +1854,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1897,50 +1884,49 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, 
v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1990,8 +1976,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2019,8 +2005,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2049,13 +2035,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2064,20 +2050,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 @@ -2086,17 +2072,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; 
GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2148,8 +2133,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2177,8 +2162,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2207,13 +2192,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2222,20 +2207,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 @@ -2244,17 +2229,16 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, 
v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2306,8 +2290,8 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2335,8 +2319,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2365,13 +2349,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2379,20 +2363,20 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2400,17 +2384,16 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s2 +; 
GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2462,8 +2445,8 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2491,8 +2474,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot2_MultipleUses_mul2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2521,13 +2504,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot2_MultipleUses_mul2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2535,20 +2518,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2556,17 +2539,16 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2618,8 +2600,8 @@ entry: define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2646,8 +2628,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2673,14 +2655,14 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2688,19 +2670,19 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-NODL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2708,21 +2690,21 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -2730,7 +2712,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2759,8 +2741,8 @@ entry: define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX7-LABEL: notsdot2_sext8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2787,8 +2769,8 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notsdot2_sext8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2818,13 +2800,13 @@ define 
amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notsdot2_sext8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2832,36 +2814,35 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v3 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0001 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 1d68b0ba0a2800..fdd913867c8f89 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 
v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,32 +93,31 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -134,13 +133,10 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -197,8 +193,8 @@ entry: define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 
s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -239,8 +235,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -280,14 +276,14 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -309,49 +305,47 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_sshort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_sshort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_sshort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 -; GFX10-DL-NEXT: global_store_short v1, v4, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v4, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -408,8 
+402,8 @@ entry: define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -442,8 +436,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -475,14 +469,14 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -496,49 +490,47 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: 
global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; 
GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -587,8 +579,8 @@ entry: define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -622,8 +614,8 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -658,13 +650,13 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 
0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -675,38 +667,37 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -725,12 +716,9 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -796,8 +784,8 @@ entry: define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -830,8 +818,8 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -867,13 +855,13 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 @@ -884,32 +872,31 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s2, v3 +; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, 
v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -925,13 +912,10 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -974,8 +958,8 @@ entry: define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1016,8 +1000,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1053,15 +1037,15 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1070,35 +1054,35 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 
; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1107,30 +1091,29 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, 
v4, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1167,13 +1150,10 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1245,8 +1225,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1273,8 +1253,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1302,48 +1282,47 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1362,13 +1341,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1417,8 +1393,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1448,8 +1424,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1480,13 +1456,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -1495,37 +1471,36 @@ define amdgpu_kernel void 
@idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1544,13 +1519,10 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1606,8 +1578,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1637,8 +1609,8 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1669,13 +1641,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr 
addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v3, 24, v1 @@ -1684,37 +1656,36 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; 
GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1733,13 +1704,10 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -1794,8 +1762,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1826,8 +1794,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1857,8 +1825,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1873,13 +1841,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1887,15 +1855,14 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -1909,13 +1876,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1969,7 +1933,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2006,7 +1970,7 @@ define amdgpu_kernel void 
@idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2044,7 +2008,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2067,7 +2031,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2089,7 +2053,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2112,9 +2076,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2181,7 +2143,7 @@ entry: define 
amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2215,7 +2177,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2250,7 +2212,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -2272,7 +2234,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2296,7 +2258,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2320,9 +2282,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; 
GFX11-DL-LABEL: idot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2383,44 +2343,44 @@ entry: define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_sext_i32_i16 s5, s12 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 
8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_i32_i24 v1, v3, s5, v1 ; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2430,14 +2390,14 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_sext_i32_i16 s3, s8 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, s3, v1 +; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 @@ -2451,49 +2411,49 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; 
GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8 +; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 +; 
GFX9-DL-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -2501,24 +2461,24 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: idot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_sext_i32_i16 s3, s8 -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s3, s2 +; 
GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3 ; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2526,25 +2486,23 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: idot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_sext_i32_i16 s3, s8 -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s3, s2 +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -2596,8 +2554,8 @@ entry: define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2627,8 +2585,8 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2659,13 +2617,13 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 @@ -2674,37 +2632,36 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, 
s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2723,13 +2680,10 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x3c -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2789,7 +2743,7 @@ entry: define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -2822,7 +2776,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -2856,7 +2810,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ -2878,7 +2832,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2902,7 +2856,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -2926,9 +2880,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -2991,25 +2943,25 @@ entry: define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; 
GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 @@ -3017,7 +2969,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8 @@ -3027,14 +2979,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_i32_i24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3077,9 +3029,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; 
GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3103,9 +3055,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -3132,9 +3084,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3159,11 +3111,9 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: 
s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -3243,8 +3193,8 @@ entry: define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_nonstandard_signed: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3280,8 +3230,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_nonstandard_signed: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3317,10 +3267,10 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_nonstandard_signed: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3333,7 +3283,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-NODL-NEXT: 
v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3342,15 +3292,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_nonstandard_signed: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] @@ -3363,7 +3313,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 @@ -3372,16 +3322,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) 
%src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_nonstandard_signed: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3411,12 +3360,9 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_nonstandard_signed: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index fb94b504781b10..0b131ea74f1abb 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 
; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -44,8 +44,8 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -79,13 +79,13 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -93,38 +93,37 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: 
s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -132,13 +131,10 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -196,8 +192,8 @@ entry: define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -230,8 +226,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -268,15 +264,15 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: 
global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -287,56 +283,54 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; 
GFX10-DL-LABEL: udot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc16: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -394,8 +388,8 @@ entry: define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -428,8 +422,8 @@ define amdgpu_kernel void 
@udot4_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -461,14 +455,14 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -482,49 +476,47 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_acc8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -574,8 +566,8 @@ entry: define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; 
GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -602,8 +594,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -629,14 +621,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -644,59 +636,57 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v2 -; GFX9-NODL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; 
GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_byte 
v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot2_8: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -740,8 +730,8 @@ entry: define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -774,8 +764,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -807,14 +797,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 @@ -828,49 +818,47 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationInsideMAD: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -920,8 +908,8 @@ entry: define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -954,8 +942,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -987,14 +975,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1008,49 +996,47 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v5, v4, v1 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v9, v8, v1 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v1, 
s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX9-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v2, v4 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] @@ -1100,8 +1086,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1135,8 +1121,8 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1171,13 +1157,13 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_mul1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -1188,38 +1174,37 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 
v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1238,12 +1223,9 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_mul1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1309,8 +1291,8 @@ entry: define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_multiuse_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1345,8 +1327,8 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_multiuse_add1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1382,13 +1364,13 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_multiuse_add1: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt 
vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8 @@ -1398,38 +1380,37 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s2 -; GFX9-NODL-NEXT: v_add_u32_e32 v4, s2, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 +; GFX9-NODL-NEXT: v_add_u32_e32 v4, s0, v2 ; GFX9-NODL-NEXT: v_add3_u32 v2, v2, v3, v6 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v1, v4 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_add_i32 s3, s2, s2 +; GFX9-DL-NEXT: s_add_i32 s1, s0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: v_add3_u32 v1, s3, v3, v1 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: v_add3_u32 v1, s1, v3, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: 
udot4_multiuse_add1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1447,12 +1428,9 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_multiuse_add1: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -1520,8 +1498,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1556,8 +1534,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1594,15 +1572,15 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1613,27 +1591,27 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; 
GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0302 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1644,25 +1622,25 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort 
v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1678,16 +1656,14 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -1763,8 +1739,8 @@ entry: define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX7-LABEL: notdot4_mixedtypes2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -1801,8 +1777,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX8-LABEL: notdot4_mixedtypes2: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,15 +1817,15 @@ 
define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: notdot4_mixedtypes2: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -1861,7 +1837,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1869,20 +1845,20 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes2: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1894,7 +1870,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v8, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v5, v6, v3 @@ -1902,14 +1878,14 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v9, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes2: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1917,7 +1893,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1937,16 +1913,14 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 -; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: notdot4_mixedtypes2: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 @@ -2027,8 +2001,8 @@ entry: define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2061,8 +2035,8 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2096,13 +2070,13 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -2110,38 +2084,37 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2149,13 +2122,10 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; 
GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2199,8 +2169,8 @@ entry: define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2237,8 +2207,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2273,16 +2243,16 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc16_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff -; GFX9-NODL-NEXT: s_mov_b32 s3, 0x5040100 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2290,13 +2260,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -2304,21 +2274,21 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff -; GFX9-DL-NEXT: s_mov_b32 s3, 0x5040100 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -2326,13 +2296,13 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s3 -; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s3 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s3 -; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s3 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2340,15 +2310,14 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 @@ -2384,12 +2353,9 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc16_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2461,8 +2427,8 @@ entry: define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2495,8 +2461,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) 
%src1, ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2532,14 +2498,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc8_vecMul: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) @@ -2555,19 +2521,19 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -2583,15 +2549,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2625,13 +2590,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc8_vecMul: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -2700,8 +2662,8 @@ entry: define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_2ele: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2728,8 +2690,8 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_2ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2757,48 +2719,47 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_2ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: 
idot4_acc32_2ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_2ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -2816,13 +2777,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_2ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -2870,8 +2828,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -2901,8 +2859,8 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2933,13 +2891,13 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; 
GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -2948,37 +2906,36 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -2996,13 +2953,10 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3057,8 +3011,8 @@ entry: define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_3ele_permuted: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3088,8 +3042,8 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_3ele_permuted: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ 
-3120,13 +3074,13 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v1 @@ -3135,37 +3089,36 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020003 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; 
GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -3183,13 +3136,10 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -3245,8 +3195,8 @@ entry: define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_opt: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3277,8 +3227,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_opt: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3308,8 +3258,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_opt: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3324,13 +3274,13 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_opt: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3338,15 +3288,14 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_opt: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3359,13 +3308,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_opt: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -3419,7 +3365,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 
@@ -3456,7 +3402,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3494,7 +3440,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3517,7 +3463,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3539,7 +3485,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3561,9 +3507,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: 
s_clause 0x2 @@ -3631,7 +3575,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -3665,7 +3609,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -3700,7 +3644,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] @@ -3722,7 +3666,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3746,7 +3690,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -3769,9 +3713,7 @@ define amdgpu_kernel void 
@udot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -3834,44 +3776,44 @@ entry: define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_bad_source: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0xf -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b32 s14, 0 -; GFX7-NEXT: s_mov_b32 s15, s11 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0xf +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x11 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_and_b32 s5, s12, 0xffff +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: 
v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, s0, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, s5, v1 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_bad_source: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3881,14 +3823,14 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_and_b32 s3, s8, 0xffff +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, s3, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1 ; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 @@ -3902,49 +3844,49 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_bad_source: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_and_b32 s3, s8, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_bad_source: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_and_b32 s4, 
s8, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201 +; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm @@ -3952,24 +3894,24 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-LABEL: udot4_bad_source: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_and_b32 s3, s8, 0xffff -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3 ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -3977,25 +3919,23 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1, ; GFX11-DL-LABEL: udot4_bad_source: ; GFX11-DL: ; %bb.0: ; %entry ; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b32 s8, s[2:3], 0x3c -; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c +; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] -; GFX11-DL-NEXT: s_and_b32 s3, s8, 0xffff -; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0 ; GFX11-DL-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 ; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s3, s2 +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3 ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2 ; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -4047,8 +3987,8 @@ entry: define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_commutative: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4078,8 +4018,8 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_commutative: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4110,13 +4050,13 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_commutative: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 @@ -4125,37 +4065,36 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 -; 
GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_commutative: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc020100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_commutative: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] @@ -4173,13 +4112,10 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_commutative: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] @@ -4239,7 +4175,7 @@ entry: define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_3src_3ele_src0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b32 s14, 0 ; GFX7-NEXT: s_mov_b32 s15, s11 @@ -4272,7 +4208,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_3src_3ele_src0: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -4306,7 +4242,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4328,7 +4264,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4352,7 +4288,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x2 @@ -4375,9 +4311,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x2 @@ -4440,25 +4374,25 @@ entry: define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_4src: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; GFX7-NEXT: s_mov_b32 s15, 0xf000 -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, s15 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s14, 0 +; GFX7-NEXT: s_mov_b32 s15, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[6:7] -; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[8:9] -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; 
GFX7-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x11 -; GFX7-NEXT: s_mov_b32 s14, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[6:7] +; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[12:13], s[10:11] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 @@ -4466,7 +4400,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s0 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v2, s4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 8, 8 @@ -4476,14 +4410,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v2, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_4src: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -4526,9 +4460,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; 
GFX9-NODL-LABEL: udot4_4src: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] @@ -4552,9 +4486,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: udot4_4src: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 ; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -4581,9 +4515,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: udot4_4src: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x3 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -4607,11 +4541,9 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_4src: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x44 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x3 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] @@ -4691,8 +4623,8 @@ entry: define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32_multi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4734,8 +4666,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot4_acc32_multi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -4777,13 +4709,13 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: udot4_acc32_multi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 @@ -4795,45 +4727,44 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; 
GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s2, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 ; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 ; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 ; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 -; GFX9-NODL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_multi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x6040200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x2000200 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x6040200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x2000200 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v2, s[6:7] -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0x7050301 ; GFX9-DL-NEXT: s_mov_b32 s6, 0x3010301 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v1, v0, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s3 +; GFX9-DL-NEXT: v_perm_b32 v5, v3, v3, s1 ; GFX9-DL-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v4, v5, s5 ; GFX9-DL-NEXT: v_perm_b32 v3, v3, v3, s6 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v3, v1 -; GFX9-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-DL-NEXT: global_store_dword 
v2, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_multi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] @@ -4854,12 +4785,9 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: udot4_acc32_multi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5] @@ -4954,8 +4882,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hilo: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -4986,8 +4914,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hilo: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5019,8 +4947,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hilo: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5035,13 +4963,13 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hilo: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5049,15 +4977,14 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hilo: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5070,13 +4997,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hilo: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4 @@ -5131,8 +5055,8 @@ entry: define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_lohi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5163,8 +5087,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_lohi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5196,8 +5120,8 @@ define amdgpu_kernel void 
@idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_lohi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] @@ -5212,34 +5136,33 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_lohi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x10302 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: 
idot4_acc32_lohi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5255,13 +5178,10 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_lohi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5320,8 +5240,8 @@ entry: define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_hihi: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 @@ -5352,8 +5272,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_hihi: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -5387,8 +5307,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_hihi: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 @@ -5403,34 +5323,33 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_hihi: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x1030200 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3010002 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 ; 
GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_hihi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 @@ -5446,13 +5365,10 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_hihi: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 @@ -5511,16 +5427,16 @@ entry: define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v8i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, s7 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8 @@ -5534,17 +5450,17 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc32_v8i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -5558,19 +5474,19 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc32_v8i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NODL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 @@ -5584,12 +5500,12 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX9-DL-LABEL: idot4_acc32_v8i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 ; GFX9-DL-NEXT: global_store_dword v2, v0, s[4:5] @@ -5597,30 +5513,28 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, ; ; GFX10-DL-LABEL: idot4_acc32_v8i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; ; GFX11-DL-LABEL: idot4_acc32_v8i8: ; 
GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-DL-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 -; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-DL-NEXT: s_nop 0 ; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DL-NEXT: s_endpgm @@ -5667,8 +5581,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v16i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -5702,8 +5616,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v16i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5734,8 +5648,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v16i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NODL-NEXT: ; kill: killed $vgpr5 @@ -5754,42 +5668,41 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0 -; GFX9-NODL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v16i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x7050002 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s3, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-DL-NEXT: ; kill: killed $vgpr5 ; GFX9-DL-NEXT: ; kill: killed $vgpr4 -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s2 +; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s3 +; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0 -; GFX9-DL-NEXT: global_store_dword v1, v0, s[0:1] +; 
GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v16i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: ; kill: killed $vgpr4 +; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] @@ -5804,13 +5717,10 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v16i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5] ; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7] @@ -5869,8 +5779,8 @@ entry: define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_v256i8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, 
s3 @@ -5903,8 +5813,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_v256i8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xfc @@ -5938,8 +5848,8 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_v256i8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) @@ -5955,36 +5865,35 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v4, v5 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v6, v2 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_v256i8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x3020001 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_dword v3, v1, 
s[4:5] offset:252 -; GFX9-DL-NEXT: s_mov_b32 s3, 0x1000302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x1000302 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v2, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 ; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_v256i8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252 @@ -5999,13 +5908,10 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_v256i8: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7] ; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252 @@ -6063,8 +5969,8 @@ entry: define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32_anyext: ; GFX7: 
; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -6090,8 +5996,8 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot4_acc32_anyext: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -6118,49 +6024,48 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX9-NODL-LABEL: idot4_acc32_anyext: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s2, v1 -; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 +; GFX9-NODL-NEXT: 
global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_anyext: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0500 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0500 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0100 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s3 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v1, s1 ; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_anyext: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -6177,13 +6082,10 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1, ; ; GFX11-DL-LABEL: idot4_acc32_anyext: ; GFX11-DL: ; %bb.0: ; %entry -; GFX11-DL-NEXT: s_clause 0x1 -; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-DL-NEXT: v_and_b32_e32 
v0, 0x3ff, v0 -; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-NEXT: s_clause 0x1 ; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 99bb4d50b03d4c..8c53d2671de3f6 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -12,13 +12,13 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -63,11 +63,11 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ 
-78,10 +78,10 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -116,20 +116,20 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -154,55 +154,54 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v2, v3, s2, v4 +; GFX9-NEXT: v_add3_u32 v2, v3, s0, v4 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v11, v12 ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v13, v14 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v9, v15, v16 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v8 ; GFX9-NEXT: v_add3_u32 v1, v2, v9, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -210,17 +209,16 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 
v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -307,13 +305,13 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -374,11 +372,11 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -391,11 +389,11 @@ 
define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -454,21 +452,21 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -522,26 +520,26 @@ define amdgpu_kernel void @idot8_acc16(ptr 
addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -595,21 +593,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; 
GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -679,16 +676,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc16: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -829,13 +826,13 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -896,11 +893,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -913,11 +910,11 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: 
s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 @@ -976,21 +973,21 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1044,26 +1041,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; 
%bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1117,21 +1114,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v10, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 
s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1201,16 +1197,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NOXNACK-LABEL: idot8_acc8: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: 
global_load_dword v1, v0, s[4:5] @@ -1352,13 +1348,13 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,11 +1401,11 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1420,10 +1416,10 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt 
vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4 ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 @@ -1460,20 +1456,20 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1494,7 +1490,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1506,25 +1502,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1545,7 +1541,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, v7, v8 ; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v2 @@ -1557,21 +1553,20 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v7, v8 ; GFX9-DL-NEXT: v_add3_u32 v3, v3, v9, v10 ; GFX9-DL-NEXT: v_add3_u32 v1, v3, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, 
s[0:1] +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1615,16 +1610,15 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 
s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -1743,13 +1737,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,11 +1788,11 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1809,10 +1803,10 @@ define amdgpu_kernel void 
@idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4 @@ -1847,20 +1841,20 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 @@ -1884,7 +1878,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_i32_i24_e32 v7, v7, v14 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -1892,48 +1886,47 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; 
%entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2 ; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1] @@ -1941,17 +1934,16 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2002,13 +1994,13 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,11 +2061,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, 
SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2086,11 +2078,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 @@ -2149,22 +2141,22 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: idot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: 
s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2199,9 +2191,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2213,11 +2205,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-NEXT: v_perm_b32 v10, v15, v14, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2230,27 +2222,27 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; 
GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2285,9 +2277,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 -; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 -; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 
12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 @@ -2299,11 +2291,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2316,21 +2308,20 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2415,17 +2406,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2547,13 +2537,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 
s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2614,11 +2604,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, 12 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2631,11 +2621,11 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 @@ -2714,21 +2704,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: 
idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2801,26 +2791,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 @@ -2893,22 +2883,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-XNACK: ; %bb.0: ; %entry -; GFX10-DL-XNACK-NEXT: s_clause 0x1 -; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-XNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-XNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; 
GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-XNACK-NEXT: s_clause 0x1 ; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5] @@ -2999,17 +2988,16 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul: ; GFX10-DL-NOXNACK: ; %bb.0: ; %entry -; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NOXNACK-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NOXNACK-NEXT: s_clause 0x1 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 779107cc40e1fb..3828fa557731e8 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -10,13 +10,13 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry ; 
GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -61,11 +61,11 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -76,10 +76,10 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -114,20 +114,20 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -151,7 +151,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -159,48 +159,47 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 
s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 
s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -282,13 +281,13 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -333,11 +332,11 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -350,10 +349,10 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 
; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -386,20 +385,20 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -427,25 +426,25 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short 
v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -473,27 +472,27 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -521,7 +520,7 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -600,13 +599,13 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -651,11 +650,11 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -668,10 +667,10 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -704,20 +703,20 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: 
s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -745,25 +744,25 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt 
vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 @@ -791,27 +790,27 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -839,7 +838,7 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte 
v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -918,13 +917,13 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -970,11 +969,11 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -987,10 +986,10 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 
s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1024,20 +1023,20 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1066,25 +1065,25 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; 
GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1113,27 +1112,27 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; 
GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1162,7 +1161,7 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1225,13 +1224,13 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1277,11 +1276,11 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1294,10 +1293,10 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -1331,20 +1330,20 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; 
GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1373,25 +1372,25 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 @@ -1420,27 +1419,27 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; 
GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[0:1] +; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) @@ -1469,7 +1468,7 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ 
-1530,13 +1529,13 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,11 +1582,11 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_multiuses_mul1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1598,10 +1597,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 
v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1638,20 +1637,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_multiuses_mul1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1672,7 +1671,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1684,25 +1683,25 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: 
global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 @@ -1723,7 +1722,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 @@ -1735,21 +1734,20 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 ; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1867,13 +1865,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1918,11 +1916,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc32_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1933,10 +1931,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 ; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 @@ -1971,20 +1969,20 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc32_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: 
s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 @@ -2008,7 +2006,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s2, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 @@ -2016,48 +2014,47 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: s_mov_b32 
s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] @@ -2104,13 +2101,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2155,11 +2152,11 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2172,10 +2169,10 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 @@ -2208,21 +2205,21 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2231,16 +2228,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2251,9 +2248,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2262,26 +2259,26 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2290,16 +2287,16 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2310,9 +2307,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, 
v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2321,21 +2318,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2428,13 +2424,13 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2479,11 +2475,11 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2496,10 +2492,10 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 ; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 @@ -2552,20 +2548,20 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, 
SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2612,25 +2608,25 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-NEXT: global_store_byte v3, v0, s[0:1] +; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; 
GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 @@ -2677,22 +2673,21 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] +; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2791,13 +2786,13 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; 
%entry ; GFX7-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd ; GFX7-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s14, -1 ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_add_u32 s12, s12, s9 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2843,11 +2838,11 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2860,10 +2855,10 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s15, 0xe80000 -; GFX8-NEXT: s_add_u32 s12, s12, s9 -; GFX8-NEXT: s_addc_u32 s13, s13, 0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s3 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 @@ -2897,21 +2892,21 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; ; GFX9-LABEL: udot8_acc4_vecMul: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NEXT: s_add_u32 s12, s12, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2920,16 +2915,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -2940,9 +2935,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2952,26 +2947,26 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 -; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: 
global_load_ubyte v3, v0, s[0:1] -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s14, -1 -; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 @@ -2980,16 +2975,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s2 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s2 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s2 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 ; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 @@ -3000,9 +2995,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s2 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s2 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s2 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 
v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3012,21 +3007,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s14, -1 -; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s12, s12, s9 -; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -3115,8 +3109,8 @@ entry: define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX7-LABEL: udot8_variant1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 @@ -3161,8 +3155,8 @@ define 
amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX8-LABEL: udot8_variant1: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -3208,13 +3202,13 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; ; GFX9-LABEL: udot8_variant1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v3, 15, v1 @@ -3239,7 +3233,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v6, v5 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, v8, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v3, s2, v1 +; GFX9-NEXT: v_add3_u32 v1, v3, s0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, v10, v9 ; GFX9-NEXT: v_mul_u32_u24_e32 v7, v12, v11 ; GFX9-NEXT: v_add3_u32 v1, v1, v4, v5 @@ -3247,36 +3241,35 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr, ; GFX9-NEXT: v_mul_u32_u24_e32 v9, v16, v15 ; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 ; GFX9-NEXT: v_add3_u32 v1, v1, v8, v9 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 -; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index 0f40d010e2a3a9..f7a0e296fa1733 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_lo: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 5 @@ -17,7 +17,7 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_lo: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 5 @@ -34,7 +34,7 @@ entry: define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; SI-LABEL: i64_imm_inline_hi: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -45,7 +45,7 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; ; VI-LABEL: i64_imm_inline_hi: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x12345678 @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -72,7 +72,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -87,7 +87,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_neg_0.0_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -97,7 +97,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -111,7 +111,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_imm_neg_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_bfrev_b32_e32 v0, 1 @@ -145,7 +145,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_bfrev_b32_e32 
v0, 1 @@ -159,7 +159,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -169,7 +169,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.5 @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -193,7 +193,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -0.5 @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -217,7 +217,7 @@ define amdgpu_kernel void 
@store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1.0 @@ -231,7 +231,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -241,7 +241,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1.0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2.0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; 
SI-LABEL: store_inline_imm_m_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2.0 @@ -303,7 +303,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -313,7 +313,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 4.0 @@ -327,7 +327,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -337,7 +337,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -4.0 @@ -351,7 +351,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e22f983 @@ -361,7 +361,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0.15915494 @@ -375,7 +375,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -385,7 +385,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983 @@ -399,7 +399,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -409,7 +409,7 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x45800000 @@ -423,8 +423,8 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -434,8 +434,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_0.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -450,8 +450,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,8 +461,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % ; ; 
VI-LABEL: add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -488,8 +488,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -504,8 +504,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -515,8 +515,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -531,8 +531,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -542,8 +542,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_1.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -558,8 +558,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -569,8 +569,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 
s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -585,8 +585,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,8 +596,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_2.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -612,8 +612,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -623,8 +623,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % ; ; VI-LABEL: add_inline_imm_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -639,8 +639,8 @@ define amdgpu_kernel void 
@add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +650,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo ; ; VI-LABEL: add_inline_imm_neg_4.0_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -666,7 +666,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_inline_imm_0.5_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -684,7 +684,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -708,7 +708,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: commute_add_literal_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -726,7 +726,7 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -750,8 +750,8 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -761,8 +761,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) ; ; VI-LABEL: add_inline_imm_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,8 +777,8 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -788,8 +788,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) 
; ; VI-LABEL: add_inline_imm_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,8 +804,8 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -815,8 +815,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -831,8 +831,8 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_1_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -843,8 +843,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_1_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -862,8 +862,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_2_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -874,8 +874,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float ; ; VI-LABEL: add_inline_imm_neg_2_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -893,8 +893,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_neg_16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -905,8 +905,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa ; ; VI-LABEL: add_inline_imm_neg_16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: 
s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,8 +924,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_63_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -935,8 +935,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_63_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -951,8 +951,8 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x) { ; SI-LABEL: add_inline_imm_64_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -962,8 +962,8 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: add_inline_imm_64_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -978,25 +978,23 @@ define amdgpu_kernel void 
@add_inline_imm_64_f32(ptr addrspace(1) %out, float %x define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.0 @@ -1007,25 +1005,23 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_0.5_f64: ; VI: ; 
%bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0.5 @@ -1036,25 +1032,23 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -0.5 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -0.5 @@ -1065,25 +1059,23 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i32], 
double %x) { ; SI-LABEL: add_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 1.0 @@ -1094,25 +1086,23 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -1.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -1.0 @@ -1123,25 +1113,23 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 2.0 @@ -1152,25 +1140,23 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -2.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -2.0 @@ -1181,25 +1167,23 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 4.0 -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 4.0 @@ -1210,25 +1194,23 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_neg_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], -4.0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, -4.0 @@ -1239,27 +1221,25 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 
0x3fc45f30 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 0.15915494309189532 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 @@ -1270,29 +1250,27 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; SI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 ; VI-NEXT: 
v_mov_b32_e32 v1, 0xbfc45f30 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 @@ -1303,25 +1281,23 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 @@ -1332,25 +1308,23 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_2_f64: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 2 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 @@ -1361,25 +1335,23 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32] define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 16 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 16 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 
v[0:1], s[0:1], 16 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 @@ -1390,7 +1362,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_1_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1401,7 +1373,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, -1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1417,7 +1389,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_2_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -2 @@ -1428,7 +1400,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_2_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -2 @@ -1444,7 +1416,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x define 
amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_neg_16_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -16 @@ -1455,7 +1427,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: add_inline_imm_neg_16_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -16 @@ -1471,25 +1443,23 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_63_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 63 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_63_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 63 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F @@ -1500,25 +1470,23 @@ 
define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32], double %x) { ; SI-LABEL: add_inline_imm_64_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], 64 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: add_inline_imm_64_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[0:1], 64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 @@ -1529,7 +1497,7 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1540,7 +1508,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; 
VI-NEXT: s_mov_b32 s2, -1 @@ -1555,7 +1523,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_neg_0.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1566,7 +1534,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_literal_imm_neg_0.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1581,7 +1549,7 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1592,7 +1560,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1607,7 +1575,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_0.5_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1618,7 +1586,7 @@ 
define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1633,7 +1601,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1644,7 +1612,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1659,7 +1627,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_1.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1670,7 +1638,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1685,7 +1653,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr 
addrspace(1) %out) { ; SI-LABEL: store_inline_imm_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1696,7 +1664,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1711,7 +1679,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_2.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1722,7 +1690,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1737,7 +1705,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1748,7 +1716,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1763,7 +1731,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_4.0_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1774,7 +1742,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1789,7 +1757,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1800,7 +1768,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1815,7 +1783,7 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_inline_imm_m_inv_2pi_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1826,7 +1794,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 @@ -1841,7 +1809,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; SI-LABEL: store_literal_imm_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1852,7 +1820,7 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index f407a1c26dd3eb..dcc615232e56be 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_neg_0.0_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: 
[0xc1,0x03,0x82,0xbe] @@ -20,7 +20,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_neg_0.0_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -33,7 +33,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_neg_0.0_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -44,7 +44,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_neg_0.0_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -59,7 +59,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -69,7 +69,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -81,7 +81,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -91,7 +91,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -105,7 +105,7 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_imm_neg_0.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -115,7 +115,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_imm_neg_0.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_imm_neg_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] @@ -137,7 +137,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_imm_neg_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x8000 @@ -151,7 
+151,7 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -161,7 +161,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -173,7 +173,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] @@ -183,7 +183,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: 
store_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3800 @@ -197,7 +197,7 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -207,7 +207,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -219,7 +219,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; 
VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] @@ -229,7 +229,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb800 @@ -243,7 +243,7 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -253,7 +253,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -265,7 +265,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] @@ -275,7 +275,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 @@ -289,7 +289,7 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_1.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -299,7 +299,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_1.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; 
GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -311,7 +311,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] @@ -321,7 +321,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xbc00 @@ -335,7 +335,7 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -345,7 +345,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -357,7 +357,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] @@ -367,7 +367,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_2.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -391,7 +391,7 @@ define amdgpu_kernel 
void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_2.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -403,7 +403,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] @@ -413,7 +413,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc000 @@ -427,7 +427,7 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_4.0_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 
v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -437,7 +437,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -449,7 +449,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] @@ -459,7 +459,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -473,7 +473,7 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_4.0_f16: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -483,7 +483,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_m_4.0_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -495,7 +495,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_m_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] @@ -505,7 +505,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_m_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: v_mov_b32_e32 v0, 0xc400 @@ -519,7 +519,7 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -529,7 +529,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_inline_imm_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -541,7 +541,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_inline_imm_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] @@ -551,7 +551,7 @@ define 
amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_inline_imm_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3118 @@ -565,7 +565,7 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) { define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -575,7 +575,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -587,7 +587,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] @@ -597,7 +597,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) ; ; SI-LABEL: store_inline_imm_m_inv_2pi_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0xb118 @@ -611,7 +611,7 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; GFX10-LABEL: store_literal_imm_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe] @@ -621,7 +621,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; GFX11-LABEL: store_literal_imm_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] @@ -633,7 +633,7 @@ define amdgpu_kernel void 
@store_literal_imm_f16(ptr addrspace(1) %out) { ; ; VI-LABEL: store_literal_imm_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] @@ -643,7 +643,7 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) { ; ; SI-LABEL: store_literal_imm_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x6c00 @@ -658,8 +658,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] @@ -670,12 +670,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: 
s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x00,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -683,26 +683,25 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x00,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0 ; 
encoding: [0x00,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.0 @@ -714,8 +713,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] @@ -726,12 +725,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe0,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -739,26 +738,25 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe0,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 0.5 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0.5 @@ -770,8 +768,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_0.5_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] @@ -782,12 +780,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_0.5_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe2,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -795,26 +793,25 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -0.5 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe2,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -0.5 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -0.5, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -0.5 @@ -826,8 +823,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] @@ -838,12 +835,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe4,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -851,26 +848,25 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe4,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 1.0 @@ -882,8 +878,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_1.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] @@ -894,12 +890,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_1.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe6,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -907,26 +903,25 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -1.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe6,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -1.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_1.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -1.0 @@ -938,8 +933,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] @@ -950,12 +945,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xe8,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -963,26 +958,25 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xe8,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 2.0 @@ -994,8 +988,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_2.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] @@ -1006,12 +1000,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_2.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xea,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1019,26 +1013,25 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -2.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xea,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -2.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_2.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -2.0 @@ -1050,8 +1043,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX10-LABEL: add_inline_imm_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] @@ -1062,12 +1055,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-LABEL: add_inline_imm_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xec,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1075,26 +1068,25 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xec,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 4.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 4.0 @@ -1106,8 +1098,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX10-LABEL: add_inline_imm_neg_4.0_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] @@ -1118,12 +1110,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-LABEL: add_inline_imm_neg_4.0_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: 
[0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0xee,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1131,26 +1123,25 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, -4.0 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0xee,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, -4.0 ; encoding: 
[0x00,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_neg_4.0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, -4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, -4.0 @@ -1161,7 +1152,7 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_inline_imm_0.5_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1179,7 +1170,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; GFX11-LABEL: commute_add_inline_imm_0.5_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 
s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1199,7 +1190,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1217,7 +1208,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ; ; SI-LABEL: commute_add_inline_imm_0.5_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1243,7 +1234,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: commute_add_literal_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1261,7 +1252,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: commute_add_literal_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: 
[0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1281,7 +1272,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1299,7 +1290,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad ; ; SI-LABEL: commute_add_literal_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1326,8 +1317,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_1_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] @@ -1338,12 +1329,12 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_1_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x02,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1351,26 +1342,25 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x02,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 1 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x33800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0001 @@ -1382,8 +1372,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_2_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] @@ -1394,12 +1384,12 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_2_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x04,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1407,26 +1397,25 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x04,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 2 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x34000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0002 @@ -1438,8 +1427,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_16_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: 
[0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] @@ -1450,12 +1439,12 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_16_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x20,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1463,26 +1452,25 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: 
[0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x20,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 16 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x35800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0010 @@ -1493,7 +1481,7 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_1_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1511,7 +1499,7 @@ define amdgpu_kernel void 
@add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_1_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1531,7 +1519,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1549,7 +1537,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_1_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1574,7 +1562,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_2_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; 
GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1592,7 +1580,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: add_inline_imm_neg_2_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1612,7 +1600,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1630,7 +1618,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a ; ; SI-LABEL: add_inline_imm_neg_2_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1655,7 +1643,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX10-LABEL: add_inline_imm_neg_16_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: 
[0x03,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe] ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe] @@ -1673,7 +1661,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: add_inline_imm_neg_16_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; encoding: [0x01,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] ; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1693,7 +1681,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; encoding: [0x03,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] ; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] @@ -1711,7 +1699,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr ; ; SI-LABEL: add_inline_imm_neg_16_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1737,8 +1725,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: 
add_inline_imm_63_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] @@ -1749,12 +1737,12 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_63_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x7e,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: 
[0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1762,26 +1750,25 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x7e,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 63 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_63_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x367c0000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH003F @@ -1793,8 +1780,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX10-LABEL: add_inline_imm_64_f16: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 ; encoding: [0x83,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa] ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] @@ -1805,12 +1792,12 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-LABEL: add_inline_imm_64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 ; encoding: [0x01,0x01,0x00,0xf4,0x08,0x00,0x00,0xf8] -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; encoding: [0x01,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] -; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x04,0x80,0x01,0x00] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] @@ -1818,26 +1805,25 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x8 ; encoding: [0x03,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; encoding: [0x03,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] +; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11] ; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf] -; VI-NEXT: v_add_f16_e64 v0, s4, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x04,0x80,0x01,0x00] +; VI-NEXT: v_add_f16_e64 v0, s6, 64 ; encoding: [0x00,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00] ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; ; SI-LABEL: add_inline_imm_64_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_add_f32_e32 v0, 0x36800000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm %y = fadd half %x, 0xH0040 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 342d7b0237118d..ae51c3edf1c7e7 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ 
b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -665,4 +665,4 @@ define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) { ret <2 x i16> %y } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b89dbd42e0466f..72f10ea892e53f 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -33,8 +33,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -56,7 +56,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 @@ -111,8 +111,8 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -123,8 +123,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_shared: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -135,7 +135,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -147,7 +147,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, 
s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -165,8 +165,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: -; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 -; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4 +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -177,8 +177,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX8V5-LABEL: llvm_amdgcn_is_private: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 -; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -189,7 +189,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 +; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 @@ -201,7 +201,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[6:7], 0x4 +; GFX9V5-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 @@ -219,12 +219,12 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { define amdgpu_kernel void @llvm_trap() { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: -; 
GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8V4-NEXT: s_trap 2 ; ; GFX8V5-LABEL: llvm_trap: ; GFX8V5: ; %bb.0: -; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_trap 2 ; diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll index 7c8d89ef03b1b2..4c5c136f5333f3 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=amdgpu-attributor < %s | llc | FileCheck %s +; RUN: llc < %s | FileCheck %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index eb4cba35e9946e..47110d94918879 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -11,63 +11,59 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX9-LABEL: indirect_call_known_no_special_inputs: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: s_mov_b64 s[8:9], 0 -; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 ; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, wobble@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, wobble@gotpcrel32@hi+12 -; GFX9-NEXT: 
s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, snork@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, snork@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[8:9], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, 1, s15 -; GFX9-NEXT: s_cmp_eq_u32 s8, 1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_cselect_b32 s17, s21, s19 -; GFX9-NEXT: s_cselect_b32 s16, s20, s18 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_cmp_eq_u32 s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_cselect_b32 s5, s13, s11 +; GFX9-NEXT: s_cselect_b32 s4, s12, s10 +; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: indirect_call_known_no_special_inputs: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_getpc_b64 s[6:7] -; GFX12-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16 -; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], 0 -; GFX12-NEXT: s_getpc_b64 s[8:9] -; GFX12-NEXT: s_sext_i32_i16 s9, s9 -; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 -; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_sext_i32_i16 
s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+16 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_u8 s6, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s8, 1, s12 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s8, 1 -; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-NEXT: s_cselect_b32 s7, s7, s5 -; GFX12-NEXT: s_cselect_b32 s6, s6, s4 -; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_and_b32 s4, 1, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s4, 1 +; GFX12-NEXT: s_cselect_b32 s1, s3, s1 +; GFX12-NEXT: s_cselect_b32 s0, s2, s0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index f54a511eff7f1d..8183106b0ce9d4 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -27,6 +27,7 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label 
[[LOOP]], label [[DUMMYRETURNBLOCK:%.*]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br label %loop @@ -39,10 +40,10 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -66,6 +67,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -82,7 +84,7 @@ return: define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 ; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: @@ -129,6 +131,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void +; entry: br i1 undef, label %loop1, label %loop2 @@ -145,10 +148,10 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, 
v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -189,6 +192,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void +; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 4d62d30a38ed34..76b007c22b699c 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,15 +8,15 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() @@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: 
name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() @@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %11 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9 - ; 
GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll index e7a7b8a335d0d3..4fecdb576a6de3 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll @@ -6,20 +6,17 @@ ; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 { ; GCN: %mul.i = fmul float %load, 1.500000e+01 -; UNSAFE: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } -; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" } +; UNSAFE: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "unsafe-fp-math"="true" } +; UNSAFE: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="true" } -; NOINFS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" } -; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } +; NOINFS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-infs-fp-math"="true" } +; NOINFS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "unsafe-fp-math"="false" } -; NONANS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" } -; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="false" } - -declare void @extern() #0 +; NONANS: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-nans-fp-math"="true" } +; NONANS: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "unsafe-fp-math"="false" } define float @foo(float %x) #0 { entry: - call void @extern() %mul = fmul float %x, 1.500000e+01 ret float %mul } @@ -27,7 +24,7 @@ entry: define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 { entry: %load = load float, ptr addrspace(1) %p, align 4 - %call = call fast float @foo(float %load) + %call = call fast float @foo(float %load) #0 store float %call, ptr addrspace(1) %p, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll index 807a7d26f49e53..46b2eb30c791c7 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -10,7 +10,7 @@ entry: } ; GCN-LABEL: {{^}}inline_asm_input_v2f16: -; GCN: s_mov_b32 s2, s{{[0-9]+}} +; GCN: s_mov_b32 s0, s{{[0-9]+}} define amdgpu_kernel void @inline_asm_input_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index b62bf890e65fe1..f736ca7cd625a3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -4,22 +4,22 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { ; GCN-LABEL: float4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { ; GCN-LABEL: float4_inselt_undef: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -56,23 +56,23 @@ entry: define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { ; GCN-LABEL: int4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 3 -; GCN-NEXT: s_cselect_b32 s2, s7, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 -; GCN-NEXT: s_cselect_b32 s3, s6, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cselect_b32 s3, s7, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cselect_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: s_cselect_b32 s5, s5, 1 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -85,15 +85,15 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc @@ -109,21 +109,21 @@ entry: define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry -; 
GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GCN-NEXT: s_load_dword s12, s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 ; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_mov_b32 m0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 ; GCN-NEXT: v_mov_b32_e32 v8, s2 @@ -142,14 +142,14 @@ entry: define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { ; GCN-LABEL: float16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -166,24 +166,24 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v14, s18 ; GCN-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NEXT: s_mov_b32 m0, s20 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; 
GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -195,18 +195,18 @@ entry: define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { ; GCN-LABEL: float32_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x124 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s3 +; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x70 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v33, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NEXT: v_mov_b32_e32 
v4, s40 ; GCN-NEXT: v_mov_b32_e32 v5, s41 @@ -236,48 +236,48 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: v_mov_b32_e32 v32, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v32, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 -; GCN-NEXT: v_mov_b32_e32 v28, s2 -; GCN-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: s_add_u32 s0, s2, 0x50 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 -; GCN-NEXT: v_mov_b32_e32 v24, s2 -; GCN-NEXT: s_add_u32 s2, s0, 64 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: v_mov_b32_e32 v24, s0 +; GCN-NEXT: s_add_u32 s0, s2, 64 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 -; GCN-NEXT: v_mov_b32_e32 v20, s2 -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v21, s1 +; GCN-NEXT: v_mov_b32_e32 v20, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 -; GCN-NEXT: v_mov_b32_e32 v16, s2 -; GCN-NEXT: s_add_u32 s2, s0, 32 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: s_add_u32 s0, s2, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 -; 
GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v13, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: @@ -289,8 +289,8 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -314,7 +314,7 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -335,49 +335,49 @@ entry: define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { ; GCN-LABEL: half8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 7 -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -397,7 +397,7 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 @@ -418,8 +418,8 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -443,8 +443,8 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 @@ -467,99 +467,99 @@ entry: define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: 
s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s7, 24 -; GCN-NEXT: s_cmp_lg_u32 s8, 15 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s3, s7, 24 +; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 14 +; GCN-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 14 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s7, 8 +; GCN-NEXT: s_lshr_b32 s3, s7, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 13 +; GCN-NEXT: s_cmp_lg_u32 s2, 13 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 12 +; GCN-NEXT: s_cmp_lg_u32 s2, 12 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s6, 24 +; GCN-NEXT: s_lshr_b32 s3, s6, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 11 +; GCN-NEXT: s_cmp_lg_u32 s2, 11 ; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 10 +; GCN-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 10 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 
+; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s6, 8 +; GCN-NEXT: s_lshr_b32 s3, s6, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 9 +; GCN-NEXT: s_cmp_lg_u32 s2, 9 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, 8 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc -; GCN-NEXT: s_lshr_b32 s2, s5, 24 +; GCN-NEXT: s_lshr_b32 s3, s5, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 7 +; GCN-NEXT: s_cmp_lg_u32 s2, 7 ; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 6 +; GCN-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s5, 8 +; GCN-NEXT: s_lshr_b32 s3, s5, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 5 +; GCN-NEXT: s_cmp_lg_u32 s2, 5 ; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 4 +; GCN-NEXT: s_cmp_lg_u32 s2, 4 ; GCN-NEXT: 
v_cndmask_b32_e32 v1, 1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_lshr_b32 s2, s4, 24 +; GCN-NEXT: s_lshr_b32 s3, s4, 24 ; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s8, 3 +; GCN-NEXT: s_cmp_lg_u32 s2, 3 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 16 -; GCN-NEXT: s_cmp_lg_u32 s8, 2 +; GCN-NEXT: s_lshr_b32 s3, s4, 16 +; GCN-NEXT: s_cmp_lg_u32 s2, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_lshr_b32 s2, s4, 8 +; GCN-NEXT: s_lshr_b32 s3, s4, 8 ; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc -; GCN-NEXT: s_cmp_lg_u32 s8, 1 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 ; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -580,21 +580,21 @@ entry: define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { ; GCN-LABEL: double2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s8, s[2:3], 0x44 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s8, 1 -; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s7 -; GCN-NEXT: s_cselect_b32 s3, 0, s6 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s6, 0, s6 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 ; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -607,10 +607,10 @@ entry: define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s12, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x84 -; GCN-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -661,12 +661,12 @@ entry: define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { ; GCN-LABEL: double8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s20, s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s20, 1 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -717,17 +717,17 @@ entry: define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { ; GCN-LABEL: double7_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x94 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x84 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xa4 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x94 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x84 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_lshl_b32 s2, s2, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 @@ -738,25 +738,25 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NEXT: v_mov_b32_e32 v12, s16 ; GCN-NEXT: v_mov_b32_e32 v13, s17 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: s_add_u32 s0, s2, 16 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v15, s1 +; GCN-NEXT: v_mov_b32_e32 v14, s0 ; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-NEXT: s_add_u32 s2, s0, 48 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: 
v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s2, 48 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 32 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_add_u32 s0, s2, 32 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] -; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -770,15 +770,14 @@ entry: define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { ; GCN-LABEL: double16_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[2:3], 0x124 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xe4 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: s_lshl_b32 s0, s0, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -810,7 +809,7 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NEXT: v_mov_b32_e32 v30, s18 ; GCN-NEXT: v_mov_b32_e32 v31, s19 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -868,22 +867,20 @@ entry: define amdgpu_kernel void 
@double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { ; GCN-LABEL: double15_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0xa4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x114 -; GCN-NEXT: s_load_dwordx4 s[20:23], s[2:3], 0x104 -; GCN-NEXT: s_load_dwordx8 s[24:31], s[2:3], 0xe4 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xa4 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x114 +; GCN-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x104 +; GCN-NEXT: s_load_dwordx8 s[24:31], s[0:1], 0xe4 ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_load_dword s4, s[2:3], 0x124 -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x124 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s4, 1 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_lshl_b32 s2, s4, 1 ; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s9 @@ -909,8 +906,9 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v25, s21 ; GCN-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 @@ -964,13 +962,13 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; 
GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe80000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe80000 +; GCN-NEXT: s_add_u32 s4, s4, s3 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 @@ -982,16 +980,16 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen +; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1019,11 +1017,11 @@ entry: define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, 
i32 %sel) { ; GCN-LABEL: bit128_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x44 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s3, s4, 24 +; GCN-NEXT: s_lshr_b32 s1, s4, 24 ; GCN-NEXT: s_lshr_b32 s8, s4, 16 ; GCN-NEXT: s_lshr_b32 s9, s4, 17 ; GCN-NEXT: s_lshr_b32 s10, s4, 18 @@ -1059,10 +1057,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_lshr_b32 s41, s7, 21 ; GCN-NEXT: s_lshr_b32 s42, s7, 22 ; GCN-NEXT: s_lshr_b32 s43, s7, 23 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x77 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x77 ; GCN-NEXT: v_mov_b32_e32 v15, s43 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x76 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x76 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s42 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1070,11 +1068,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x75 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x75 ; GCN-NEXT: v_or_b32_e32 v15, v15, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s41 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x74 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x74 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s40 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1083,11 +1081,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x73 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x73 ; 
GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_mov_b32_e32 v18, s39 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x72 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x72 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s38 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1095,11 +1093,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x71 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x71 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s37 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x70 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x70 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s36 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1111,11 +1109,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7f ; GCN-NEXT: v_or_b32_e32 v15, v18, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s35 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1123,11 +1121,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7c +; GCN-NEXT: s_cmpk_lg_i32 s0, 
0x7c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1136,22 +1134,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s35 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x7a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x7a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s35 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x78 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x78 ; GCN-NEXT: v_mov_b32_e32 v13, s35 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x79 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x79 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s35 ; GCN-NEXT: v_cndmask_b32_e32 v13, 1, v13, vcc @@ -1166,11 +1164,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6f ; GCN-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v18, 15, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 14, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, 
-1, 0 @@ -1178,11 +1176,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 13, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 12, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1191,11 +1189,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x6a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x6a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 10, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1203,11 +1201,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x69 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x69 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 9, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x68 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x68 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 8, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1219,11 +1217,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 ; GCN-NEXT: 
v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x67 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x67 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x66 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x66 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s7 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1231,11 +1229,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x65 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x65 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x64 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x64 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1244,11 +1242,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x63 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x63 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x62 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x62 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s7 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1256,11 +1254,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 
s2, 0x61 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x61 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x60 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x60 ; GCN-NEXT: v_mov_b32_e32 v16, s7 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1273,11 +1271,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 4, v18 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 ; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x57 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x57 ; GCN-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v17, s34 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x56 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x56 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s33 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1285,11 +1283,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x55 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x55 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_mov_b32_e32 v18, s31 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x54 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x54 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s30 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1298,11 +1296,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x53 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x53 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_mov_b32_e32 v18, s29 ; GCN-NEXT: 
s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x52 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x52 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s28 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1310,11 +1308,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x51 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x51 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_mov_b32_e32 v19, s27 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x50 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x50 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_mov_b32_e32 v20, s26 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1326,11 +1324,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 4, v17 ; GCN-NEXT: v_and_b32_e32 v18, 15, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5f ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 7, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5e ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s25 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1338,11 +1336,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5d ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5c ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, 
v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1351,22 +1349,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5b ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 3, s25 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x5a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x5a ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s25 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x58 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x58 ; GCN-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 3, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x59 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x59 ; GCN-NEXT: v_or_b32_e32 v19, v19, v20 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 1, s25 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc @@ -1380,11 +1378,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GCN-NEXT: v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4f +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4f ; GCN-NEXT: v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v3, 15, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4e +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4e ; GCN-NEXT: v_lshrrev_b16_e64 v18, 14, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1392,11 +1390,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, 
<128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4d +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4d ; GCN-NEXT: v_or_b32_e32 v3, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 13, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4c +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4c ; GCN-NEXT: v_lshrrev_b16_e64 v19, 12, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1405,11 +1403,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_and_b32_e32 v18, 3, v18 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4b +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4b ; GCN-NEXT: v_or_b32_e32 v3, v18, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 11, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x4a +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x4a ; GCN-NEXT: v_lshrrev_b16_e64 v19, 10, s6 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1417,11 +1415,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 3, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x49 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x49 ; GCN-NEXT: v_or_b32_e32 v18, v18, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 9, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x48 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x48 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 8, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1433,11 +1431,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v18, v19, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GCN-NEXT: v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x47 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x47 ; GCN-NEXT: v_or_b32_e32 v18, v3, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 7, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x46 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x46 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 6, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1445,11 +1443,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x45 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x45 ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 5, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x44 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x44 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 4, s6 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1458,11 +1456,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: v_and_b32_e32 v19, 3, v19 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x43 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x43 ; GCN-NEXT: v_or_b32_e32 v19, v19, v3 ; GCN-NEXT: v_lshrrev_b16_e64 v3, 3, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x42 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x42 ; GCN-NEXT: v_lshrrev_b16_e64 v20, 2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v3, 1, v3, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1470,11 +1468,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v20, 1, v20 ; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 ; GCN-NEXT: v_lshlrev_b16_e32 v20, 2, v20 -; GCN-NEXT: s_cmpk_lg_i32 s2, 0x41 +; GCN-NEXT: s_cmpk_lg_i32 s0, 0x41 ; GCN-NEXT: v_or_b32_e32 v3, v3, v20 ; GCN-NEXT: 
v_lshrrev_b16_e64 v20, 1, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 64 +; GCN-NEXT: s_cmp_lg_u32 s0, 64 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v20, 1, v20, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1487,11 +1485,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v19 ; GCN-NEXT: v_and_b32_e32 v2, 15, v2 -; GCN-NEXT: s_cmp_lg_u32 s2, 55 +; GCN-NEXT: s_cmp_lg_u32 s0, 55 ; GCN-NEXT: v_or_b32_e32 v2, v2, v15 ; GCN-NEXT: v_mov_b32_e32 v15, s24 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 54 +; GCN-NEXT: s_cmp_lg_u32 s0, 54 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s23 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1499,12 +1497,12 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 53 +; GCN-NEXT: s_cmp_lg_u32 s0, 53 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s22 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 52 +; GCN-NEXT: s_cmp_lg_u32 s0, 52 ; GCN-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s21 @@ -1514,11 +1512,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 51 +; GCN-NEXT: s_cmp_lg_u32 s0, 51 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: 
v_mov_b32_e32 v16, s20 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 50 +; GCN-NEXT: s_cmp_lg_u32 s0, 50 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1526,11 +1524,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 49 +; GCN-NEXT: s_cmp_lg_u32 s0, 49 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 48 +; GCN-NEXT: s_cmp_lg_u32 s0, 48 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v18, s17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1542,11 +1540,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 63 +; GCN-NEXT: s_cmp_lg_u32 s0, 63 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 62 +; GCN-NEXT: s_cmp_lg_u32 s0, 62 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s16 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1554,11 +1552,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 61 +; GCN-NEXT: s_cmp_lg_u32 s0, 61 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 60 +; GCN-NEXT: s_cmp_lg_u32 s0, 60 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, 
vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1567,22 +1565,22 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 59 +; GCN-NEXT: s_cmp_lg_u32 s0, 59 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s16 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 58 +; GCN-NEXT: s_cmp_lg_u32 s0, 58 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 56 +; GCN-NEXT: s_cmp_lg_u32 s0, 56 ; GCN-NEXT: v_mov_b32_e32 v14, s16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 57 +; GCN-NEXT: s_cmp_lg_u32 s0, 57 ; GCN-NEXT: v_or_b32_e32 v17, v17, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 1, s16 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc @@ -1596,11 +1594,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 -; GCN-NEXT: s_cmp_lg_u32 s2, 47 +; GCN-NEXT: s_cmp_lg_u32 s0, 47 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v14, 15, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 46 +; GCN-NEXT: s_cmp_lg_u32 s0, 46 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 14, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1608,11 +1606,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: 
v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 45 +; GCN-NEXT: s_cmp_lg_u32 s0, 45 ; GCN-NEXT: v_or_b32_e32 v14, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 13, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 44 +; GCN-NEXT: s_cmp_lg_u32 s0, 44 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 12, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1621,11 +1619,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 43 +; GCN-NEXT: s_cmp_lg_u32 s0, 43 ; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v16, 11, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 42 +; GCN-NEXT: s_cmp_lg_u32 s0, 42 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 10, s5 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1633,11 +1631,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 41 +; GCN-NEXT: s_cmp_lg_u32 s0, 41 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 9, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 40 +; GCN-NEXT: s_cmp_lg_u32 s0, 40 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 8, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1649,11 +1647,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GCN-NEXT: v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GCN-NEXT: 
s_cmp_lg_u32 s2, 39 +; GCN-NEXT: s_cmp_lg_u32 s0, 39 ; GCN-NEXT: v_or_b32_e32 v16, v14, v16 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 7, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 38 +; GCN-NEXT: s_cmp_lg_u32 s0, 38 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1661,11 +1659,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 37 +; GCN-NEXT: s_cmp_lg_u32 s0, 37 ; GCN-NEXT: v_or_b32_e32 v14, v14, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 36 +; GCN-NEXT: s_cmp_lg_u32 s0, 36 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 4, s5 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1674,11 +1672,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 35 +; GCN-NEXT: s_cmp_lg_u32 s0, 35 ; GCN-NEXT: v_or_b32_e32 v17, v17, v14 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 3, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 34 +; GCN-NEXT: s_cmp_lg_u32 s0, 34 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 2, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1686,11 +1684,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_lshlrev_b16_e32 v14, 3, v14 ; GCN-NEXT: v_lshlrev_b16_e32 v18, 2, v18 -; GCN-NEXT: s_cmp_lg_u32 s2, 33 +; GCN-NEXT: s_cmp_lg_u32 s0, 33 ; GCN-NEXT: v_or_b32_e32 v18, v14, v18 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 1, s5 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 32 
+; GCN-NEXT: s_cmp_lg_u32 s0, 32 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1704,11 +1702,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: s_cmp_lg_u32 s2, 23 +; GCN-NEXT: s_cmp_lg_u32 s0, 23 ; GCN-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 22 +; GCN-NEXT: s_cmp_lg_u32 s0, 22 ; GCN-NEXT: v_cndmask_b32_e32 v15, 1, v15, vcc ; GCN-NEXT: v_mov_b32_e32 v16, s14 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1716,11 +1714,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v16, 1, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 3, v15 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 2, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 21 +; GCN-NEXT: s_cmp_lg_u32 s0, 21 ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_mov_b32_e32 v16, s13 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 20 +; GCN-NEXT: s_cmp_lg_u32 s0, 20 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s12 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1729,11 +1727,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_and_b32_e32 v16, 3, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 19 +; GCN-NEXT: s_cmp_lg_u32 s0, 19 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: v_mov_b32_e32 v16, s11 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 18 +; GCN-NEXT: s_cmp_lg_u32 s0, 18 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: v_mov_b32_e32 v17, s10 ; 
GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1741,11 +1739,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 17 +; GCN-NEXT: s_cmp_lg_u32 s0, 17 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_mov_b32_e32 v17, s9 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 16 +; GCN-NEXT: s_cmp_lg_u32 s0, 16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_mov_b32_e32 v19, s8 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1757,24 +1755,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v15, 4, v15 ; GCN-NEXT: v_and_b32_e32 v16, 15, v16 -; GCN-NEXT: s_cmp_lg_u32 s2, 31 +; GCN-NEXT: s_cmp_lg_u32 s0, 31 ; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v16, 7, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 30 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 30 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 6, s1 ; GCN-NEXT: v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 29 +; GCN-NEXT: s_cmp_lg_u32 s0, 29 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 5, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 28 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 28 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 4, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1782,24 
+1780,24 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 27 +; GCN-NEXT: s_cmp_lg_u32 s0, 27 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v17, 3, s1 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 26 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 26 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 2, s1 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: s_cmp_lg_u32 s2, 24 -; GCN-NEXT: v_mov_b32_e32 v18, s3 +; GCN-NEXT: s_cmp_lg_u32 s0, 24 +; GCN-NEXT: v_mov_b32_e32 v18, s1 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 3, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v19, 2, v19 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 25 +; GCN-NEXT: s_cmp_lg_u32 s0, 25 ; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s3 +; GCN-NEXT: v_lshrrev_b16_e64 v19, 1, s1 ; GCN-NEXT: v_cndmask_b32_e32 v18, 1, v18, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v19, 1, v19, vcc @@ -1811,11 +1809,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v16, 12, v16 ; GCN-NEXT: v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 15 +; GCN-NEXT: s_cmp_lg_u32 s0, 15 ; GCN-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NEXT: v_lshrrev_b16_e64 v16, 15, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 14 +; GCN-NEXT: s_cmp_lg_u32 s0, 14 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 14, s4 ; GCN-NEXT: 
v_cndmask_b32_e32 v16, 1, v16, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1823,11 +1821,11 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_lshlrev_b16_e32 v16, 3, v16 ; GCN-NEXT: v_lshlrev_b16_e32 v17, 2, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 13 +; GCN-NEXT: s_cmp_lg_u32 s0, 13 ; GCN-NEXT: v_or_b32_e32 v16, v16, v17 ; GCN-NEXT: v_lshrrev_b16_e64 v17, 13, s4 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 12 +; GCN-NEXT: s_cmp_lg_u32 s0, 12 ; GCN-NEXT: v_lshrrev_b16_e64 v18, 12, s4 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v17, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1835,52 +1833,52 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_lshlrev_b16_e32 v17, 1, v17 ; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: s_cmp_lg_u32 s2, 11 +; GCN-NEXT: s_cmp_lg_u32 s0, 11 ; GCN-NEXT: v_lshrrev_b16_e64 v19, 11, s4 ; GCN-NEXT: v_and_b32_e32 v17, 3, v17 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 10 +; GCN-NEXT: s_cmp_lg_u32 s0, 10 ; GCN-NEXT: v_lshrrev_b16_e64 v14, 10, s4 ; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: v_cndmask_b32_e32 v17, 1, v19, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 9 +; GCN-NEXT: s_cmp_lg_u32 s0, 9 ; GCN-NEXT: v_lshrrev_b16_e64 v12, 9, s4 ; GCN-NEXT: v_cndmask_b32_e32 v14, 1, v14, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 8 +; GCN-NEXT: s_cmp_lg_u32 s0, 8 ; GCN-NEXT: v_lshrrev_b16_e64 v11, 8, s4 ; GCN-NEXT: v_cndmask_b32_e32 v12, 1, v12, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 7 +; GCN-NEXT: s_cmp_lg_u32 s0, 7 ; GCN-NEXT: v_lshrrev_b16_e64 v10, 7, s4 ; GCN-NEXT: v_cndmask_b32_e32 v11, 1, v11, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 6 +; GCN-NEXT: s_cmp_lg_u32 s0, 6 ; GCN-NEXT: v_lshrrev_b16_e64 v9, 6, s4 ; GCN-NEXT: 
v_cndmask_b32_e32 v10, 1, v10, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 5 +; GCN-NEXT: s_cmp_lg_u32 s0, 5 ; GCN-NEXT: v_lshrrev_b16_e64 v8, 5, s4 ; GCN-NEXT: v_cndmask_b32_e32 v9, 1, v9, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 4 +; GCN-NEXT: s_cmp_lg_u32 s0, 4 ; GCN-NEXT: v_lshrrev_b16_e64 v7, 4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v8, 1, v8, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 3 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 ; GCN-NEXT: v_lshrrev_b16_e64 v6, 3, s4 ; GCN-NEXT: v_cndmask_b32_e32 v7, 1, v7, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 2 +; GCN-NEXT: s_cmp_lg_u32 s0, 2 ; GCN-NEXT: v_lshrrev_b16_e64 v5, 2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v6, 1, v6, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: s_cmp_lg_u32 s0, 1 ; GCN-NEXT: v_lshrrev_b16_e64 v4, 1, s4 ; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1919,9 +1917,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_or_b32_e32 v11, v16, v11 ; GCN-NEXT: v_or_b32_e32 v0, v0, v7 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 2a8eac8712e52a..68427e8937bb94 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 @@ -40,7 +40,7 @@ define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -53,7 +53,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -71,7 +71,7 @@ define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x flo define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -84,7 
+84,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 @@ -102,7 +102,7 @@ define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -115,7 +115,7 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 @@ -135,8 +135,8 @@ define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32 define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -150,8 +150,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], 
s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -170,8 +170,8 @@ define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s1, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -185,8 +185,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s1, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -205,8 +205,8 @@ define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -220,8 +220,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s2, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -240,8 +240,8 @@ define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -255,8 +255,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo ; ; VI-LABEL: insertelement_v4f32_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -275,8 +275,8 @@ define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x flo define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_movk_i32 s0, 0x3e7 ; SI-NEXT: s_mov_b32 s7, 0x100f000 @@ -290,8 +290,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: insertelement_v4i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: s_movk_i32 s0, 0x3e7 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 @@ -310,8 +310,8 @@ define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -323,8 +323,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 @@ -341,8 +341,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x flo define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -354,8 +354,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x flo ; ; VI-LABEL: insertelement_v3f32_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, 
-1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 @@ -497,8 +497,8 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind { define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -516,8 +516,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -540,9 +540,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -564,9 +564,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: 
s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -593,9 +593,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -621,9 +621,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,9 +654,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 
0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -677,9 +677,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -705,10 +705,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -720,8 +720,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v9 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 @@ -731,10 +731,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], 
s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -745,8 +745,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v9 @@ -762,10 +762,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,9 +777,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: s_mov_b32 m0, s4 ; 
SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_movreld_b32_e32 v0, v10 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -789,10 +789,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -803,9 +803,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, v10 @@ -821,10 +821,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -849,8 +849,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -859,8 +859,8 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -883,10 +883,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -912,8 +912,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 
s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -922,8 +922,8 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -947,9 +947,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -980,9 +980,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1018,8 +1018,8 @@ define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, < define amdgpu_kernel void 
@dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1034,8 +1034,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1055,9 +1055,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1075,9 +1075,9 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, 
-1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1100,10 +1100,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dword s9, s[6:7], 0x11 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dword s9, s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1124,10 +1124,10 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dword s9, s[6:7], 0x44 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dword s9, s[4:5], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1153,9 +1153,9 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, 
-1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1175,9 +1175,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1202,10 +1202,10 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v9i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x18 -; SI-NEXT: s_load_dword s5, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,8 +1217,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_mov_b32 m0, s5 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1227,10 +1227,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; ; VI-LABEL: dynamic_insertelement_v9i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; 
VI-NEXT: s_load_dword s4, s[6:7], 0x60 -; VI-NEXT: s_load_dword s5, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_mov_b32 m0, s5 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1257,10 +1257,10 @@ define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v10i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x18 -; SI-NEXT: s_load_dword s6, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1272,9 +1272,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: v_mov_b32_e32 v9, s5 -; SI-NEXT: s_mov_b32 m0, s6 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; 
SI-NEXT: s_mov_b32 m0, s4 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1283,10 +1283,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v10i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x60 -; VI-NEXT: s_load_dword s6, s[6:7], 0x80 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -1297,9 +1297,9 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_mov_b32 m0, s6 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 @@ -1314,10 +1314,10 @@ define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v11i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: 
s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1341,17 +1341,17 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v11i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1374,10 +1374,10 @@ define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v12i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x18 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1402,17 +1402,17 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v12i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 
0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x60 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 @@ -1436,9 +1436,9 @@ define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1458,7 +1458,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v14, s22 ; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1468,9 +1468,9 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; ; VI-LABEL: dynamic_insertelement_v16i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: 
s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1490,7 +1490,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < ; VI-NEXT: v_mov_b32_e32 v13, s21 ; VI-NEXT: v_mov_b32_e32 v14, s22 ; VI-NEXT: v_mov_b32_e32 v15, s23 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 @@ -1505,7 +1505,7 @@ define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, < define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1522,7 +1522,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1544,8 +1544,8 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s8, s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,8 +1565,8 @@ define amdgpu_kernel 
void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s8, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1592,33 +1592,33 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: v_mov_b32_e32 v0, 0xff ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_lshlrev_b16_e32 v0, s4, v0 +; VI-NEXT: 
s_lshl_b32 s5, s6, 3 +; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0 ; VI-NEXT: v_not_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v1, s5, v1 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1634,17 +1634,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1654,17 +1654,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, 
s5, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1679,34 +1679,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dword s5, s[6:7], 0xa +; SI-NEXT: s_load_dword s6, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dword s4, s[4:5], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s6, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_andn2_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s5, 0x5050505 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[6:7], 0x28 +; VI-NEXT: s_load_dword s6, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, s5, s4 -; VI-NEXT: 
s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_andn2_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s5, 0x5050505 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1718,46 +1718,46 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_lshl_b32 s0, s4, 3 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; SI-NEXT: s_and_b32 s5, s1, 0x5050505 +; SI-NEXT: s_and_b32 s9, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; SI-NEXT: s_and_b32 s4, s0, 0x5050505 -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s8, s0, 0x5050505 +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_lshl_b32 s0, s4, 3 -; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 3 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; VI-NEXT: s_and_b32 s5, s1, 0x5050505 +; VI-NEXT: s_and_b32 s9, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; VI-NEXT: s_and_b32 s4, s0, 0x5050505 -; VI-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3] +; VI-NEXT: s_and_b32 s8, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b @@ -1768,196 +1768,196 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x4 -; SI-NEXT: s_load_dword s4, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 +; SI-NEXT: s_load_dword s6, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s5, s11, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 15 +; SI-NEXT: s_lshr_b32 s4, s11, 24 +; SI-NEXT: s_cmp_lg_u32 s6, 15 +; SI-NEXT: s_cselect_b32 s4, s4, 5 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: s_lshr_b32 s5, s11, 16 +; SI-NEXT: s_cmp_lg_u32 s6, 14 ; SI-NEXT: s_cselect_b32 s5, s5, 5 
-; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshr_b32 s6, s11, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 14 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_lshr_b32 s6, s11, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 13 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 12 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshr_b32 s5, s11, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 13 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 12 ; SI-NEXT: s_cselect_b32 s7, s11, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b32 s6, s10, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 11 -; SI-NEXT: s_cselect_b32 s6, s6, 5 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s10, 24 +; SI-NEXT: s_cmp_lg_u32 s6, 11 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshr_b32 s7, s10, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 10 +; SI-NEXT: s_cmp_lg_u32 s6, 10 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_lshr_b32 s7, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 9 +; SI-NEXT: s_cmp_lg_u32 s6, 9 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 8 +; SI-NEXT: s_cmp_lg_u32 s6, 8 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_or_b32 s7, s10, s7 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: s_lshr_b32 s7, s9, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 7 +; SI-NEXT: 
s_cmp_lg_u32 s6, 7 ; SI-NEXT: s_cselect_b32 s7, s7, 5 ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshr_b32 s10, s9, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 6 +; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s7, s7, s10 ; SI-NEXT: s_lshr_b32 s10, s9, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 5 +; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 4 +; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_lshr_b32 s9, s8, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 3 +; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: s_cselect_b32 s9, s9, 5 ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_lshr_b32 s10, s8, 16 -; SI-NEXT: s_cmp_lg_u32 s4, 2 +; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 1 +; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: s_cselect_b32 s10, s10, 5 ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cselect_b32 s4, s8, 5 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s4, s4, s10 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s4, s4, s9 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s6, s8, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; 
; VI-LABEL: dynamic_insertelement_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x10 -; VI-NEXT: s_load_dword s4, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s11, 24 -; VI-NEXT: s_cmp_lg_u32 s4, 15 -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_lshr_b32 s4, s11, 24 +; VI-NEXT: s_cmp_lg_u32 s6, 15 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 14 +; VI-NEXT: s_lshr_b32 s4, s11, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 14 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s11, 8 +; VI-NEXT: s_lshr_b32 s4, s11, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 13 +; VI-NEXT: s_cmp_lg_u32 s6, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 12 +; VI-NEXT: s_cmp_lg_u32 s6, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s10, 24 +; VI-NEXT: s_lshr_b32 s4, s10, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 11 +; VI-NEXT: s_cmp_lg_u32 s6, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 
+; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 10 +; VI-NEXT: s_lshr_b32 s4, s10, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 10 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s10, 8 +; VI-NEXT: s_lshr_b32 s4, s10, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 9 +; VI-NEXT: s_cmp_lg_u32 s6, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 8 +; VI-NEXT: s_cmp_lg_u32 s6, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; VI-NEXT: s_lshr_b32 s5, s9, 24 +; VI-NEXT: s_lshr_b32 s4, s9, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 7 +; VI-NEXT: s_cmp_lg_u32 s6, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 6 +; VI-NEXT: s_lshr_b32 s4, s9, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 6 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s9, 8 +; VI-NEXT: s_lshr_b32 s4, s9, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 5 +; VI-NEXT: s_cmp_lg_u32 s6, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 4 +; VI-NEXT: s_cmp_lg_u32 s6, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_lshr_b32 s5, s8, 24 +; VI-NEXT: s_lshr_b32 s4, s8, 24 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cmp_lg_u32 s4, 3 +; VI-NEXT: s_cmp_lg_u32 s6, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 16 -; VI-NEXT: s_cmp_lg_u32 s4, 2 +; VI-NEXT: s_lshr_b32 s4, s8, 16 +; VI-NEXT: s_cmp_lg_u32 s6, 2 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_lshr_b32 s5, s8, 8 +; VI-NEXT: s_lshr_b32 s4, s8, 8 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; VI-NEXT: s_cmp_lg_u32 s4, 1 +; VI-NEXT: s_cmp_lg_u32 s6, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1977,26 +1977,26 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry -; 
SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 .LBB42_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s5, s[2:3], 0x1 -; SI-NEXT: s_mov_b64 s[6:7], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; SI-NEXT: s_load_dword s7, s[2:3], 0x1 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB42_3 ; SI-NEXT: .LBB42_2: ; %if -; SI-NEXT: s_load_dword s5, s[2:3], 0x0 +; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB42_4: @@ -2004,23 +2004,23 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc0 .LBB42_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dword s5, s[2:3], 0x4 +; VI-NEXT: s_load_dword s7, s[2:3], 0x4 ; VI-NEXT: s_cbranch_execnz .LBB42_3 ; VI-NEXT: .LBB42_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s5, s[2:3], 0x0 +; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, 
s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB42_4: @@ -2050,9 +2050,9 @@ endif: define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x18 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x18 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2071,9 +2071,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x60 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x60 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2097,9 +2097,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[6:7], 0x8 -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2118,9 
+2118,9 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[6:7], 0x20 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2144,20 +2144,20 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0xc +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s12, 1 +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_cselect_b32 s6, 0, s11 -; SI-NEXT: s_cselect_b32 s7, 5, s10 -; SI-NEXT: s_cmp_eq_u32 s12, 0 +; SI-NEXT: s_cselect_b32 s7, 0, s11 +; SI-NEXT: s_cselect_b32 s10, 5, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: s_cselect_b32 s9, 0, s9 ; SI-NEXT: s_cselect_b32 s8, 5, s8 -; SI-NEXT: s_cmp_eq_u32 s12, 2 +; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: s_cselect_b32 s5, 0, s5 ; SI-NEXT: s_cselect_b32 s4, 5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -2165,27 +2165,27 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 
-; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s12, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s12, 1 +; VI-NEXT: s_cmp_eq_u32 s6, 1 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_cselect_b32 s6, 0, s11 -; VI-NEXT: s_cselect_b32 s7, 5, s10 -; VI-NEXT: s_cmp_eq_u32 s12, 0 +; VI-NEXT: s_cselect_b32 s7, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 5, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: s_cselect_b32 s9, 0, s9 ; VI-NEXT: s_cselect_b32 s8, 5, s8 -; VI-NEXT: s_cmp_eq_u32 s12, 2 +; VI-NEXT: s_cmp_eq_u32 s6, 2 ; VI-NEXT: s_cselect_b32 s5, 0, s5 ; VI-NEXT: s_cselect_b32 s4, 5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2193,8 +2193,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -2205,67 +2205,67 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x10 -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, 1 -; SI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; SI-NEXT: s_cselect_b32 s6, 0, s10 -; SI-NEXT: s_cmp_eq_u32 s4, 0 +; SI-NEXT: s_cmp_eq_u32 s6, 1 +; SI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; SI-NEXT: s_cselect_b32 s5, 0, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cmp_eq_u32 s6, 3 ; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; SI-NEXT: s_cselect_b32 s10, 0, s14 -; SI-NEXT: s_cmp_eq_u32 s4, 2 -; SI-NEXT: s_cselect_b32 s4, 0x40200000, s13 +; SI-NEXT: s_cmp_eq_u32 s6, 2 +; SI-NEXT: s_cselect_b32 s6, 0x40200000, s13 ; SI-NEXT: s_cselect_b32 s11, 0, s12 ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x40 -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, 
-1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 1 -; VI-NEXT: s_cselect_b32 s5, 0x40200000, s11 -; VI-NEXT: s_cselect_b32 s6, 0, s10 -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s6, 1 +; VI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; VI-NEXT: s_cselect_b32 s5, 0, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 ; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9 ; VI-NEXT: s_cselect_b32 s8, 0, s8 -; VI-NEXT: s_cmp_eq_u32 s4, 3 +; VI-NEXT: s_cmp_eq_u32 s6, 3 ; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15 ; VI-NEXT: s_cselect_b32 s10, 0, s14 -; VI-NEXT: s_cmp_eq_u32 s4, 2 -; VI-NEXT: s_cselect_b32 s4, 0x40200000, s13 +; VI-NEXT: s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cselect_b32 s6, 0x40200000, s13 ; VI-NEXT: s_cselect_b32 s11, 0, s12 ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x double> %a, double 8.0, i32 %b @@ -2276,13 +2276,13 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[6:7], 0x20 -; SI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x10 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, 
s4, 1 +; SI-NEXT: s_lshl_b32 s4, s6, 1 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 @@ -2311,13 +2311,13 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[6:7], 0x80 -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 1 +; VI-NEXT: s_lshl_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 3135addec16183..c9b01eb5a97255 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: 
s_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -58,6 +58,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -67,7 +68,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -81,7 +82,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -95,7 +96,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: s_insertelement_v2bf16_1: ; GFX900: ; 
%bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -107,7 +108,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: s_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -116,6 +117,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1 store <2 x bfloat> %vecins, ptr addrspace(1) %out @@ -125,7 +127,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +144,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -160,7 +162,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v2bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -173,8 +175,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x40a0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -184,6 +185,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -197,7 +199,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -214,7 +216,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -232,7 +234,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm: ; 
GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dword v1, v0, s[2:3] @@ -244,8 +246,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dword v1, v0, s[2:3] @@ -254,6 +255,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_bfi_b32 v1, s2, 53, v1 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -267,7 +269,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -284,7 +286,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v2bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -302,7 +304,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; 
GFX900-LABEL: v_insertelement_v2bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -315,8 +317,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v2bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -326,6 +327,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX940-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -339,7 +341,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -356,7 +358,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2bf16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -374,7 +376,7 @@ define amdgpu_kernel void 
@v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -386,8 +388,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -396,6 +397,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; GFX940-NEXT: v_perm_b32 v1, 35, v1, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -409,8 +411,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -432,8 +434,8 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; 
VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -459,11 +461,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dword v1, v0, s[4:5] +; GFX900-NEXT: global_load_dword v1, v0, s[6:7] ; GFX900-NEXT: global_load_dword v2, v0, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff ; GFX900-NEXT: s_waitcnt vmcnt(1) @@ -477,14 +479,13 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dword v1, v0, s[0:1] +; GFX940-NEXT: global_load_dword v1, v0, s[2:3] ; GFX940-NEXT: global_load_dword v2, v0, s[6:7] -; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -493,6 +494,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v1, v1, s0, v2 ; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = 
call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -508,27 +510,27 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xc +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v2, s5, v4, v2 +; SI-NEXT: v_bfi_b32 v2, s4, v4, v2 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -546,13 +548,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_0: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 
0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v3, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -560,18 +562,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_0: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v0, s1, v3, v0 +; GFX940-NEXT: v_bfi_b32 v0, s0, v3, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -587,17 +589,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -606,8 +608,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -625,30 +627,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_1: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v0, s4, v0, v3 +; GFX900-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_1: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; 
GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v0, s0, v0, v3 +; GFX940-NEXT: v_perm_b32 v0, s2, v0, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -664,27 +666,27 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0xc -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0xc +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 +; SI-NEXT: v_bfi_b32 v3, s4, v4, v3 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 
; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4bf16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -702,13 +704,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_2: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_mov_b32 s2, 0xffff -; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v3, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -716,18 +718,18 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX940-LABEL: v_insertelement_v4bf16_2: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x30 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x30 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v1, s1, v3, v1 +; GFX940-NEXT: v_bfi_b32 v1, s0, v3, v1 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call 
i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -743,17 +745,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; SI-LABEL: v_insertelement_v4bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 @@ -762,8 +764,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v4bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -781,30 +783,30 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v4bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; 
GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v3 +; GFX900-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v4bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v3 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v3 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -820,23 +822,23 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s6, s4, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s5, 4 -; SI-NEXT: s_or_b32 s6, s4, s6 -; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s5 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_and_b32 s5, s8, 0xffff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_lshl_b32 s6, s9, 4 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfi_b32 v3, s5, v4, v3 ; SI-NEXT: v_bfi_b32 v2, s4, v5, v2 @@ -845,8 +847,8 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -871,13 +873,13 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX900-NEXT: s_lshl_b32 s2, s5, 4 -; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: s_lshl_b32 s2, 
s7, 4 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 ; GFX900-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_mov_b32_e32 v4, s4 @@ -889,15 +891,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] -; GFX940-NEXT: s_lshl_b32 s1, s1, 4 -; GFX940-NEXT: s_pack_ll_b32_b16 s2, s0, s0 -; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s1 +; GFX940-NEXT: s_lshl_b32 s0, s3, 4 +; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, s2 +; GFX940-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 ; GFX940-NEXT: v_mov_b32_e32 v3, s2 ; GFX940-NEXT: v_mov_b32_e32 v4, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -905,6 +906,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -920,17 +922,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v8bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: 
s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -939,8 +941,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v8bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -959,8 +961,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX900-LABEL: v_insertelement_v8bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -970,27 +972,27 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; GFX900-NEXT: v_bfi_b32 v3, s2, v3, v3 ; GFX900-NEXT: v_bfi_b32 v2, s2, v2, v2 ; GFX900-NEXT: v_bfi_b32 v0, s2, v0, v0 -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v5 +; GFX900-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-NEXT: s_endpgm ; ; GFX940-LABEL: v_insertelement_v8bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s1, 0xffff +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_bfi_b32 v3, s1, v3, v3 -; GFX940-NEXT: v_bfi_b32 v2, s1, v2, v2 -; GFX940-NEXT: v_bfi_b32 v0, s1, v0, v0 -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v5 +; GFX940-NEXT: v_bfi_b32 v3, s0, v3, v3 +; GFX940-NEXT: v_bfi_b32 v2, s0, v2, v2 +; GFX940-NEXT: v_bfi_b32 v0, s0, v0, v0 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v5 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1006,48 +1008,48 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v8bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 -; SI-NEXT: 
s_cmp_eq_u32 s5, 6 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_cmp_eq_u32 s9, 6 +; SI-NEXT: v_mov_b32_e32 v6, s8 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_cmp_eq_u32 s9, 7 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 4 +; SI-NEXT: s_cmp_eq_u32 s9, 4 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 5 +; SI-NEXT: s_cmp_eq_u32 s9, 5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 2 +; SI-NEXT: s_cmp_eq_u32 s9, 2 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: s_cmp_eq_u32 s9, 3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 0 +; SI-NEXT: s_cmp_eq_u32 s9, 0 ; SI-NEXT: v_or_b32_e32 v2, v2, v7 ; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 1 +; SI-NEXT: s_cmp_eq_u32 s9, 1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1063,8 +1065,8 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1117,40 +1119,40 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 6 +; GFX900-NEXT: v_mov_b32_e32 v5, s6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 +; GFX900-NEXT: s_cmp_eq_u32 s7, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 +; GFX900-NEXT: s_cmp_eq_u32 s7, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 +; GFX900-NEXT: s_cmp_eq_u32 s7, 2 ; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 +; GFX900-NEXT: s_cmp_eq_u32 s7, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 +; GFX900-NEXT: s_cmp_eq_u32 s7, 0 ; 
GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 +; GFX900-NEXT: s_cmp_eq_u32 s7, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1162,49 +1164,49 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX940-LABEL: v_insertelement_v8bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v5, s0 +; GFX940-NEXT: s_cmp_eq_u32 s3, 6 +; GFX940-NEXT: v_mov_b32_e32 v5, s2 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s3, 7 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s3, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s3, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v6, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s3, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 0 +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s3, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1220,18 +1222,18 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; SI-LABEL: v_insertelement_v16bf16_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dword s4, s[6:7], 0x4 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[8:11], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[8:11], 0 addr64 
offset:16 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -1242,8 +1244,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_insertelement_v16bf16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1269,15 +1271,15 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX900-LABEL: v_insertelement_v16bf16_3: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_perm_b32 v1, s4, v1, v9 +; GFX900-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1285,20 +1287,20 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX940-LABEL: v_insertelement_v16bf16_3: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dword s0, s[2:3], 0x10 -; GFX940-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_perm_b32 v1, s0, v1, v9 +; GFX940-NEXT: v_perm_b32 v1, s2, v1, v9 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1314,21 +1316,22 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s11, 0x100f000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s5, 6 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s5, 7 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, 
v10, v6, vcc ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -1414,8 +1417,8 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -1511,74 +1514,74 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v9, s4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 6 +; GFX900-NEXT: v_mov_b32_e32 v9, s6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 +; GFX900-NEXT: s_cmp_eq_u32 s7, 7 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 +; GFX900-NEXT: s_cmp_eq_u32 s7, 4 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 +; GFX900-NEXT: s_cmp_eq_u32 s7, 5 ; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 +; GFX900-NEXT: s_cmp_eq_u32 s7, 2 ; GFX900-NEXT: 
v_perm_b32 v4, v4, v10, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 +; GFX900-NEXT: s_cmp_eq_u32 s7, 3 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 +; GFX900-NEXT: s_cmp_eq_u32 s7, 0 ; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 +; GFX900-NEXT: s_cmp_eq_u32 s7, 1 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 14 +; GFX900-NEXT: s_cmp_eq_u32 s7, 14 ; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 15 +; GFX900-NEXT: s_cmp_eq_u32 s7, 15 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 12 +; GFX900-NEXT: s_cmp_eq_u32 s7, 12 ; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 13 +; GFX900-NEXT: s_cmp_eq_u32 s7, 13 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 10 +; GFX900-NEXT: s_cmp_eq_u32 s7, 10 ; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 11 +; GFX900-NEXT: s_cmp_eq_u32 s7, 11 ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX900-NEXT: s_cmp_eq_u32 s5, 8 +; GFX900-NEXT: s_cmp_eq_u32 s7, 8 ; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 9 +; GFX900-NEXT: s_cmp_eq_u32 s7, 9 ; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1591,84 +1594,84 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX940-LABEL: v_insertelement_v16bf16_dynamic: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:16 -; GFX940-NEXT: s_cmp_eq_u32 s1, 6 -; GFX940-NEXT: v_mov_b32_e32 v9, s0 +; GFX940-NEXT: s_cmp_eq_u32 s3, 6 +; GFX940-NEXT: v_mov_b32_e32 v9, s2 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 7 +; GFX940-NEXT: s_cmp_eq_u32 s3, 7 ; GFX940-NEXT: s_waitcnt vmcnt(1) ; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 4 +; GFX940-NEXT: s_cmp_eq_u32 s3, 4 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 5 +; GFX940-NEXT: s_cmp_eq_u32 s3, 5 ; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 2 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX940-NEXT: s_cmp_eq_u32 
s3, 2 +; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 3 +; GFX940-NEXT: s_cmp_eq_u32 s3, 3 ; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 0 -; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 0 +; GFX940-NEXT: v_perm_b32 v2, v10, v2, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 1 +; GFX940-NEXT: s_cmp_eq_u32 s3, 1 ; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 14 -; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 14 +; GFX940-NEXT: v_perm_b32 v1, v10, v1, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 15 +; GFX940-NEXT: s_cmp_eq_u32 s3, 15 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 12 -; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 12 +; GFX940-NEXT: v_perm_b32 v0, v10, v0, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 13 +; GFX940-NEXT: s_cmp_eq_u32 s3, 13 ; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 10 -; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 10 +; GFX940-NEXT: v_perm_b32 v7, v10, v7, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX940-NEXT: s_cmp_eq_u32 s1, 11 +; GFX940-NEXT: s_cmp_eq_u32 s3, 11 ; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 8 -; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX940-NEXT: s_cmp_eq_u32 s3, 8 +; GFX940-NEXT: v_perm_b32 v6, v10, v6, s0 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX940-NEXT: s_cmp_eq_u32 s1, 9 +; GFX940-NEXT: s_cmp_eq_u32 s3, 9 ; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX940-NEXT: v_perm_b32 v4, v9, v4, s2 +; GFX940-NEXT: v_perm_b32 v5, v10, v5, s0 +; GFX940-NEXT: v_perm_b32 v4, v9, v4, s0 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm +; %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 647870f0e08979..1ba2491d2210ec 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: 
s_insertelement_v2i16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -54,21 +54,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -83,8 +83,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: s_insertelement_v2i16_0_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, 
s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_0_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -121,14 +121,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART @@ -138,8 +138,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -179,8 +179,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -207,21 +207,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: 
s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -235,8 +235,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -251,8 +251,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -274,12 +274,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -291,8 +291,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; VI-LABEL: 
s_insertelement_v2i16_0_reghi_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -310,8 +310,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -330,8 +330,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -359,12 +359,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 @@ -380,8 +380,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -402,8 +402,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -425,8 +425,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 @@ -462,7 +462,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -474,7 
+474,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -508,21 +508,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 { ; GFX9-LABEL: s_insertelement_v2i16_1_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_1_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -537,8 +537,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; ; CI-LABEL: 
s_insertelement_v2i16_1_reg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -554,8 +554,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; GFX11-LABEL: s_insertelement_v2i16_1_reg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -575,7 +575,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -588,7 +588,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -602,7 +602,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) @@ -623,7 +623,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -635,7 +635,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,7 +649,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: s_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -669,7 +669,7 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -718,9 +718,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -744,21 +742,21 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_reghi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v1, s3 @@ -776,8 +774,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -795,12 +793,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v2i16_0_reghi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -824,7 +819,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -836,7 +831,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -854,7 +849,7 @@ define amdgpu_kernel void 
@v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -872,9 +867,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -898,7 +891,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -911,7 +904,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -929,7 +922,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2i16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -947,9 +940,7 @@ define 
amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2i16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -973,7 +964,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -985,7 +976,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1003,7 +994,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1021,9 +1012,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1046,7 +1035,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1059,7 +1048,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1077,7 +1066,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1095,9 +1084,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1121,7 +1108,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) 
#0 { ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1133,7 +1120,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1151,7 +1138,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1169,9 +1156,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1194,7 +1179,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1207,7 +1192,7 @@ define 
amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1225,7 +1210,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v2f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1243,9 +1228,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v2f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1269,7 +1252,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1281,7 +1264,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v1, s3 @@ -1299,7 +1282,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1317,9 +1300,7 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1343,16 +1324,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s6, 4 +; GFX9-NEXT: s_lshl_b32 s2, s4, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, s7, s2 +; GFX9-NEXT: s_andn2_b32 s3, s5, s2 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1361,10 +1342,10 @@ define amdgpu_kernel void 
@s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1380,10 +1361,10 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s4, s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[6:7], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1400,8 +1381,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-LABEL: s_insertelement_v2i16_dynamic: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -1428,13 +1409,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 @@ -1443,8 +1424,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1464,8 +1445,8 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1485,15 +1466,13 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 @@ -1514,11 +1493,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1532,8 +1511,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1559,8 +1538,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -1586,10 +1565,8 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; GFX11-LABEL: 
v_insertelement_v2f16_dynamic_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1620,13 +1597,13 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1634,8 +1611,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1653,8 +1630,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_0: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1673,12 +1650,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1702,21 +1676,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, s4, v0, v3 +; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,8 +1708,8 @@ define amdgpu_kernel 
void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_1: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1754,12 +1728,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1783,13 +1754,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1797,8 +1768,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4f16_2: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x30 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,8 +1787,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0xc +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1836,12 +1807,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x30 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1865,21 +1833,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: v_perm_b32 v1, s4, v1, v3 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1897,8 +1865,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1917,12 +1885,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1946,13 +1911,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, 
s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1960,8 +1925,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v4i16_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1979,8 +1944,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v4i16_2: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1999,12 +1964,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v4i16_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2029,11 +1991,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void 
@v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff @@ -2048,11 +2010,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -2075,11 +2037,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -2102,22 +2064,20 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 
s0, s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 @@ -2139,13 +2099,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 { ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX9-NEXT: s_lshl_b32 s2, s7, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 @@ -2157,8 +2117,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; 
VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2183,8 +2143,8 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2209,12 +2169,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_lshl_b32 s1, s1, 4 @@ -2242,21 +2199,21 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, s4, v1, v5 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2275,8 +2232,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2295,12 +2252,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2324,13 +2278,13 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v8i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: 
s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] @@ -2338,8 +2292,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_insertelement_v8i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2358,8 +2312,8 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; CI-LABEL: v_insertelement_v8i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2378,12 +2332,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_insertelement_v8i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2407,40 +2358,40 @@ 
define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 ; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 +; GFX9-NEXT: 
s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2452,8 +2403,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v8f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2506,8 +2457,8 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v8f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2576,12 +2527,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v8f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_cmp_eq_u32 s1, 6 @@ -2637,15 +2585,15 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16f16_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_perm_b32 v1, s4, v1, v9 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2653,8 +2601,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16f16_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2680,8 +2628,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16f16_3: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 @@ -2707,12 +2655,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_insertelement_v16f16_3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: 
s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2741,14 +2686,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { ; GFX9-LABEL: v_insertelement_v16i16_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2758,8 +2703,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: v_insertelement_v16i16_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dword s4, s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2784,8 +2729,8 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; CI-LABEL: v_insertelement_v16i16_6: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dword s4, s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2811,12 +2756,9 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: 
v_insertelement_v16i16_6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] @@ -2845,74 +2787,74 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 +; 
GFX9-NEXT: s_cmp_eq_u32 s7, 2 ; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 14 +; GFX9-NEXT: s_cmp_eq_u32 s7, 14 ; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 15 +; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 12 +; GFX9-NEXT: s_cmp_eq_u32 s7, 12 ; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 13 +; GFX9-NEXT: s_cmp_eq_u32 s7, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 10 +; GFX9-NEXT: s_cmp_eq_u32 s7, 10 ; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 11 +; GFX9-NEXT: s_cmp_eq_u32 s7, 11 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 8 +; GFX9-NEXT: 
s_cmp_eq_u32 s7, 8 ; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 9 +; GFX9-NEXT: s_cmp_eq_u32 s7, 9 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2925,8 +2867,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -3022,8 +2964,8 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; CI-LABEL: v_insertelement_v16f16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3152,12 +3094,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index b9dc27cb7e0192..62b3fadf8b2ed1 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -438,7 +438,7 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -468,7 +468,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -498,7 +498,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 @@ -528,7 +528,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -558,7 +558,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_cvt_f32_u32_e32 v0, s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 @@ -595,7 +595,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 @@ -694,19 +694,19 @@ main_body: define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; GFX9-LABEL: atomic_add_local: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b64 s[0:1], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: @@ -714,19 +714,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX90A-LABEL: atomic_add_local: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB5_2 ; GFX90A-NEXT: ; 
%bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: s_mul_i32 s0, s0, 5 -; GFX90A-NEXT: v_mov_b32_e32 v1, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: s_mul_i32 s1, s1, 5 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NEXT: ds_add_u32 v0, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB5_2: @@ -734,18 +734,18 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX10-LABEL: atomic_add_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, exec_lo -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX10-NEXT: s_mov_b32 s2, exec_lo +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0 -; GFX10-NEXT: s_mul_i32 s0, s0, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s1, s2 +; GFX10-NEXT: s_mul_i32 s1, s1, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_add_u32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -754,19 +754,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX9-FLATSCR-LABEL: atomic_add_local: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec -; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec +; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, 
s3, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s1, 5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB5_2: @@ -774,19 +774,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; GFX11-LABEL: atomic_add_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB5_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX11-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s0, s0, 5 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX11-NEXT: s_mul_i32 s1, s1, 5 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: ds_add_u32 v0, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -795,19 +795,19 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { ; ; 
GFX12-LABEL: atomic_add_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB5_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX12-NEXT: s_bcnt1_i32_b32 s1, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s0, s0, 5 -; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1 +; GFX12-NEXT: s_mul_i32 s1, s1, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_u32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -898,10 +898,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -910,8 +910,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -927,10 +927,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB7_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -939,8 +939,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB7_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -951,26 +951,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: atomic_add_ret_local: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB7_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; 
GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB7_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -986,10 +986,10 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dword s6, s[2:3], 0x2c +; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -998,8 +998,8 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB7_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1010,26 +1010,26 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: atomic_add_ret_local: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_mov_b32 s1, exec_lo 
-; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB7_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB7_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1041,27 +1041,27 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: atomic_add_ret_local: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB7_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s1, s1, 5 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB7_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -1088,10 +1088,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 @@ -1099,8 +1099,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1116,10 +1116,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; 
GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: ; implicit-def: $vgpr1 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB8_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX90A-NEXT: s_mul_i32 s4, s4, 5 @@ -1127,8 +1127,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: .LBB8_2: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 @@ -1139,24 +1139,24 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: add_i32_constant: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, exec_lo +; GFX10-NEXT: s_mov_b32 s3, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 -; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB8_2 ; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1 -; GFX10-NEXT: s_mul_i32 s1, s1, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10-NEXT: s_mul_i32 s3, s3, 5 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: .LBB8_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1172,10 +1172,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1 -; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x34 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5 @@ -1183,8 +1183,8 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: .LBB8_2: -; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 @@ -1195,25 +1195,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: add_i32_constant: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX11-NEXT: s_mov_b32 s3, exec_lo +; GFX11-NEXT: 
s_mov_b32 s2, exec_lo +; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s1, 5 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_mul_i32 s3, s3, 5 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -1225,25 +1225,25 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: add_i32_constant: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_mov_b32 s0, exec_lo -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX12-NEXT: s_cbranch_execz .LBB8_2 ; GFX12-NEXT: ; %bb.1: -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 
s1, s1, 5 -; GFX12-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-NEXT: s_mul_i32 s3, s3, 5 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: .LBB8_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 4575df1e0c6b95..ce3494f7391d6c 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -59,10 +59,10 @@ define void @func_regular_call() #1 { ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, -; GCN-NEXT: s_addc_u32 s17, -; GCN-NEXT: s_setpc_b64 s[16:17] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, +; GCN-NEXT: s_addc_u32 s5, +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN: ; NumSgprs: 32 ; GCN: ; NumVgprs: 8 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 496a1c652da251..2370ceff89bd57 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,11 +7,11 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 6 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel -define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 { +define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { store volatile i32 1, ptr addrspace(1) %arg0 call void @llvm.trap() unreachable @@ -19,7 
+19,5 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 { ret void } -attributes #0 = { "amdgpu-no-implicitarg-ptr" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index f9073be7e260b8..69f181fcede30f 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { ; SI-LABEL: i8_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -20,10 +20,10 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; VI-LABEL: i8_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -32,8 +32,8 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; ; GFX9-LABEL: i8_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -80,11 +80,11 @@ 
define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { ; SI-LABEL: i8_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_and_b32 s4, s2, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -92,10 +92,10 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; VI-LABEL: i8_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xff +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -104,8 +104,8 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; ; GFX9-LABEL: i8_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xff @@ -155,11 +155,11 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { ; SI-LABEL: i8_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i8 s4, s4 +; SI-NEXT: s_sext_i32_i8 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -167,10 +167,10 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; VI-LABEL: i8_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i8 s2, s4 +; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -179,8 +179,8 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; ; GFX9-LABEL: i8_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -230,11 +230,11 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { ; SI-LABEL: i16_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -242,10 +242,10 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; VI-LABEL: i16_arg: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -254,8 +254,8 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; ; GFX9-LABEL: i16_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -302,11 +302,11 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { ; SI-LABEL: i16_zext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -314,10 +314,10 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; VI-LABEL: i16_zext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 
v2, s2 @@ -326,8 +326,8 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; ; GFX9-LABEL: i16_zext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -377,11 +377,11 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { ; SI-LABEL: i16_sext_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -389,10 +389,10 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; VI-LABEL: i16_sext_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s2, s4 +; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -401,8 +401,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; ; GFX9-LABEL: i16_sext_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -452,8 +452,8 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { ; SI-LABEL: i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -463,19 +463,19 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; ; VI-LABEL: i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -511,8 +511,8 @@ entry: define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { ; SI-LABEL: f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -522,19 +522,19 @@ define amdgpu_kernel 
void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; ; VI-LABEL: f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -570,8 +570,8 @@ entry: define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; SI-LABEL: v2i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,19 +581,19 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; ; VI-LABEL: v2i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i8_arg: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -659,8 +659,8 @@ entry: define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; SI-LABEL: v2i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -670,19 +670,19 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; ; VI-LABEL: v2i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -718,7 +718,7 @@ entry: define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -781,7 +781,7 @@ entry: define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -805,7 +805,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -844,8 +844,8 @@ entry: define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { ; SI-LABEL: v3i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s5, s4, 16 @@ -858,26 +858,26 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; ; VI-LABEL: v3i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s4, 16 +; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -983,7 +983,7 @@ entry: define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -998,7 +998,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1014,7 +1014,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1102,8 +1102,8 @@ entry: define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { ; SI-LABEL: v3i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1117,8 +1117,8 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; VI-LABEL: v3i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1130,14 +1130,14 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; ; GFX9-LABEL: v3i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: 
s_endpgm ; ; EG-LABEL: v3i32_arg: @@ -1181,8 +1181,8 @@ entry: define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { ; SI-LABEL: v3f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,8 +1196,8 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; VI-LABEL: v3f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -1209,14 +1209,14 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; ; GFX9-LABEL: v3f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3f32_arg: @@ -1260,8 +1260,8 @@ entry: define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; SI-LABEL: v4i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,19 +1271,19 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; ; VI-LABEL: v4i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1319,7 +1319,7 @@ entry: define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -1343,7 +1343,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 @@ -1382,8 +1382,8 @@ entry: define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { ; SI-LABEL: v4i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; VI-LABEL: v4i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1410,15 +1410,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; ; GFX9-LABEL: v4i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i32_arg: @@ -1456,8 +1456,8 @@ entry: define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { ; SI-LABEL: v4f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,8 +1470,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; VI-LABEL: v4f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1484,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; ; GFX9-LABEL: v4f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4f32_arg: @@ -1530,7 +1530,7 @@ entry: define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v5i8_arg(ptr 
addrspace(1) nocapture %out, <5 x i8> %i ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -1671,50 +1671,50 @@ entry: define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { ; SI-LABEL: v5i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 8 +; VI-NEXT: s_add_u32 s4, s2, 8 ; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: 
v_mov_b32_e32 v0, s2 ; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: @@ -1902,27 +1902,27 @@ entry: define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { ; SI-LABEL: v5i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: 
v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1941,9 +1941,9 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; ; GFX9-LABEL: v5i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s8, s[6:7], 0x30 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v5, s8 @@ -1951,8 +1951,8 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i32_arg: @@ -2000,27 +2000,27 @@ entry: define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { ; SI-LABEL: v5f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[2:3], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: 
s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s8, s[0:1], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dword s7, s[2:3], 0x54 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s7, s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2039,19 +2039,19 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; ; GFX9-LABEL: v5f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x30 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], 
s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f32_arg: @@ -2099,34 +2099,34 @@ entry: define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { ; SI-LABEL: v5i64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2155,9 +2155,9 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2241,34 +2241,34 @@ entry: define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { ; SI-LABEL: v5f64_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x21 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, 
s13 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x84 -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 ; VI-NEXT: v_mov_b32_e32 v1, s10 @@ -2297,9 +2297,9 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x60 -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -2384,7 +2384,7 @@ entry: define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2397,7 +2397,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; 
GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2635,8 +2635,8 @@ entry: define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; SI-LABEL: v8i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,8 +2649,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; VI-LABEL: v8i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2663,15 +2663,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; GFX9-LABEL: v8i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i16_arg: @@ -2883,8 +2883,8 @@ entry: define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { ; SI-LABEL: v8i32_arg: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2903,8 +2903,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; VI-LABEL: v8i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -2926,8 +2926,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; ; GFX9-LABEL: v8i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -2994,8 +2994,8 @@ entry: define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { ; SI-LABEL: v8f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3014,8 +3014,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; VI-LABEL: v8f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3037,8 +3037,8 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; ; GFX9-LABEL: v8f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -3106,8 +3106,8 @@ entry: define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; SI-LABEL: v16i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3120,8 +3120,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; VI-LABEL: v16i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -3134,15 +3134,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; ; GFX9-LABEL: v16i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v16i8_arg: @@ -3556,8 +3556,8 @@ entry: define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; SI-LABEL: v16i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3576,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; VI-LABEL: v16i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s2, s0, 16 @@ -3599,8 +3599,8 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; GFX9-LABEL: v16i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s12 @@ -4012,8 +4012,8 @@ entry: define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { ; SI-LABEL: v16i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4044,8 +4044,8 @@ define amdgpu_kernel void 
@v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; VI-LABEL: v16i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4085,8 +4085,8 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; ; GFX9-LABEL: v16i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4200,8 +4200,8 @@ entry: define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { ; SI-LABEL: v16f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4232,8 +4232,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; VI-LABEL: v16f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_add_u32 s2, s0, 48 @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; ; GFX9-LABEL: v16f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 @@ -4388,7 +4388,7 @@ entry: define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: kernel_arg_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4401,7 +4401,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; VI-LABEL: kernel_arg_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4412,7 +4412,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin ; ; GFX9-LABEL: kernel_arg_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +4450,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; SI-LABEL: f64_kernel_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4463,7 +4463,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; VI-LABEL: f64_kernel_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; ; GFX9-LABEL: f64_kernel_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4522,8 +4522,8 @@ entry: define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s8, s4, 1 @@ -4539,8 +4539,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -4558,11 +4558,11 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s4, s6, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -4640,11 +4640,11 @@ entry: define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword 
s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -4652,10 +4652,10 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; VI-LABEL: i1_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4664,8 +4664,8 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; ; GFX9-LABEL: i1_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4731,11 +4731,11 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, 1 +; SI-NEXT: s_and_b32 s4, s2, 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4743,10 +4743,10 @@ define 
amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4755,8 +4755,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4803,8 +4803,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_zext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4816,11 +4816,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_zext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s4, 1 +; VI-NEXT: s_and_b32 s2, s2, 1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -4829,8 
+4829,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_zext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 @@ -4879,11 +4879,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x10000 +; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4891,10 +4891,10 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s2, s4, 0x10000 +; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -4903,8 +4903,8 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; ; GFX9-LABEL: i1_arg_sext_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 @@ -4953,11 +4953,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { ; SI-LABEL: i1_arg_sext_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -4966,21 +4966,21 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; ; VI-LABEL: i1_arg_sext_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i1_arg_sext_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 @@ -5062,10 +5062,10 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { define amdgpu_kernel void 
@struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { ; SI-LABEL: struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s8, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb -; SI-NEXT: s_load_dword s9, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x11 +; SI-NEXT: s_load_dword s8, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dword s9, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -5089,46 +5089,46 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; ; VI-LABEL: struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x44 +; VI-NEXT: s_load_dword s4, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dword s5, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: struct_argument_alignment: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s5, 
s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x20 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -5196,7 +5196,6 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { ; SI-LABEL: packed_struct_argument_alignment: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dword s6, s[0:1], 0x9 @@ -5230,37 +5229,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; ; VI-LABEL: packed_struct_argument_alignment: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 49 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s2, 50 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_add_u32 s0, s0, 3 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_add_u32 s0, s2, 51 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s0, 49 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s0, 50 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 
v3, s3 +; VI-NEXT: s_add_u32 s2, s2, 3 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s0, 51 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: s_add_u32 s0, s2, 53 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 53 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v7 @@ -5281,10 +5280,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; GFX9-LABEL: packed_struct_argument_alignment: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_dword v6, v2, s[6:7] offset:13 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] offset:17 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x4 +; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5380,11 +5379,11 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { ; SI-LABEL: struct_argument_alignment_after: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb -; SI-NEXT: s_load_dword s13, s[2:3], 0xf -; SI-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x11 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x15 +; SI-NEXT: s_load_dword s12, s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dword s13, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 ; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 @@ -5414,11 +5413,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; VI-LABEL: struct_argument_alignment_after: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s8, s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; VI-NEXT: s_load_dword s9, s[2:3], 0x3c -; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x44 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x54 +; VI-NEXT: s_load_dword s8, s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dword s9, s[0:1], 0x3c +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5446,19 +5445,19 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, ; ; GFX9-LABEL: struct_argument_alignment_after: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x30 +; 
GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 @@ -5546,7 +5545,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5566,7 +5565,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5584,7 +5583,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -5660,8 +5659,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; SI-LABEL: array_3xi16: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], 
s[2:3] -; SI-NEXT: s_load_dword s4, s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 @@ -5681,22 +5679,22 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; ; VI-LABEL: array_3xi16: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 38 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s2, 42 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s2, s0, 38 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s0, 42 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_ushort v4, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5713,10 +5711,10 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GFX9-LABEL: array_3xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] offset:2 -; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 +; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -5831,7 +5829,6 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; SI-LABEL: small_array_round_down_offset: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 @@ -5842,8 +5839,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; ; VI-LABEL: small_array_round_down_offset: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s2, 37 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s0, s0, 37 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v0, v[0:1] @@ -5855,7 +5852,7 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { ; GFX9-LABEL: small_array_round_down_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5889,8 +5886,8 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_align_constant_i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x49 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -5904,13 +5901,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; VI-LABEL: 
byref_align_constant_i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x124 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -5919,8 +5916,8 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; ; GFX9-LABEL: byref_align_constant_i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x100 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -5973,83 +5970,83 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { ; SI-LABEL: byref_natural_align_constant_v16i32_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s20, s[2:3], 0x29 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x29 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, 
s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: byref_natural_align_constant_v16i32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s20, s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_add_u32 s0, s2, 48 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, 
s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s20 @@ -6059,9 +6056,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; ; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x80 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s20 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index f74f9a8f2bdd82..1a73df341108fe 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -10,28 +10,28 @@ ; GCN: s_and_b32 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) #0 { +define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { store i1 %x, ptr addrspace(1) %out, align 1 ret void } ; FUNC-LABEL: {{^}}v3i8_arg: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x8 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) #0 { +define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, ptr addrspace(1) %out, align 4 ret void } ; FUNC-LABEL: {{^}}i65_arg: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; HSA-VI: .amdhsa_kernarg_size 24 -define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 { +define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { entry: store i65 %in, ptr addrspace(1) %out, align 4 ret void @@ -39,7 +39,7 @@ entry: ; FUNC-LABEL: {{^}}empty_struct_arg: ; HSA-VI: .amdhsa_kernarg_size 0 -define amdgpu_kernel void @empty_struct_arg({} %in) #0 { +define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ret void } @@ -54,13 +54,13 @@ define amdgpu_kernel void @empty_struct_arg({} %in) #0 { ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: -; HSA-VI: 
s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 ; HSA-VI: .amdhsa_kernarg_size 40 -define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) #0 { +define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg1, 0 @@ -78,11 +78,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 ; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 ; HSA-VI: .amdhsa_kernarg_size 28 -define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) #0 { +define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 %val2 = extractvalue <{i32, i64}> %arg1, 0 @@ -95,14 +95,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, } ; GCN-LABEL: {{^}}struct_argument_alignment_after: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8 -; 
HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20 -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x30 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 ; HSA-VI: .amdhsa_kernarg_size 64 -define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) #0 { +define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { %val0 = extractvalue {i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 %val2 = extractvalue {i32, i64} %arg2, 0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, } ; GCN-LABEL: {{^}}array_3xi32: -; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, ptr addrspace(1) undef store volatile [3 x i32] %arg1, ptr addrspace(1) undef @@ -124,7 +124,7 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { } ; GCN-LABEL: {{^}}array_3xi16: -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, ptr addrspace(1) undef store volatile [3 x i16] %arg1, ptr addrspace(1) undef @@ -135,7 +135,7 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; GCN: s_load_dword [[DWORD:s[0-9]+]] ; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}} ; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}} -define amdgpu_kernel void 
@v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) #0 { +define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) { entry: store <2 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -147,7 +147,7 @@ entry: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: s_or_b32 -define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) #0 { +define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) { entry: store <3 x i15> %in, ptr addrspace(1) %out, align 4 ret void @@ -156,9 +156,9 @@ entry: ; Byref pointers should only be treated as offsets from kernarg ; GCN-LABEL: {{^}}byref_constant_i8_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 +; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) { %in = load i8, ptr addrspace(4) %in.byref %ext = zext i8 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -167,9 +167,9 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; GCN-LABEL: {{^}}byref_constant_i16_arg: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8 +; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) { %in = load i16, ptr addrspace(4) %in.byref %ext = zext i16 %in to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -177,9 +177,9 @@ define 
amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_i32_arg: -; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0{{$}} +; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 16 -define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -187,10 +187,10 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou } ; GCN-LABEL: {{^}}byref_constant_v4i32_arg: -; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x10{{$}} -; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x20{{$}} +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} ; GCN: .amdhsa_kernarg_size 36 -define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) { %in = load <4 x i32>, ptr addrspace(4) %in.byref store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -198,13 +198,13 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % } ; GCN-LABEL: {{^}}byref_align_constant_i32_arg: -; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[8:9], 0x100{{$}} +; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[4:5], 0x100{{$}} ; GCN-DAG: v_mov_b32_e32 
[[V_IN:v[0-9]+]], s[[IN]] ; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]] ; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s ; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s ; GCN: .amdhsa_kernarg_size 264 -define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { %in = load i32, ptr addrspace(4) %in.byref store volatile i32 %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -212,10 +212,10 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu } ; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: -; GCN-DAG: s_load_dword s{{[0-9]+}}, s[8:9], 0x80 -; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x40{{$}} +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} ; GCN: .amdhsa_kernarg_size 132 -define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { %in = load <16 x i32>, ptr addrspace(4) %in.byref store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 @@ -224,9 +224,9 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; Also accept byref kernel arguments with other global address spaces. 
; GCN-LABEL: {{^}}byref_global_i32_arg: -; GCN: s_load_dword [[IN:s[0-9]+]], s[8:9], 0x8{{$}} +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} ; GCN: .amdhsa_kernarg_size 12 -define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) { %in = load i32, ptr addrspace(1) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -234,17 +234,17 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; GCN-LABEL: {{^}}byref_flat_i32_arg: ; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} -define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) { %in = load i32, ptr %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void } ; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: -; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s8, 8 +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 ; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} ; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} -define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) { %in = load i32, ptr addrspace(6) %in.byref store i32 %in, ptr addrspace(1) %out, align 4 ret void @@ -257,9 +257,9 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; } ; GCN-LABEL: {{^}}multi_byref_constant_i32_arg: -; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0 +; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 ; GCN: .amdhsa_kernarg_size 20 -define amdgpu_kernel void 
@multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) #0 { +define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) { %in0 = load i32, ptr addrspace(4) %in0.byref %in1 = load i32, ptr addrspace(4) %in1.byref store volatile i32 %in0, ptr addrspace(1) %out, align 4 @@ -271,15 +271,13 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0: ; GCN-NOT: s4 ; GCN-NOT: s5 -; GCN: s_load_dword {{s[0-9]+}}, s[8:9], 0x0{{$}} +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} ; GCN: .amdhsa_kernarg_size 4 -define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) #0 { +define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) { %in = load i32, ptr addrspace(4) %in.byref store i32 %in, ptr addrspace(1) undef, align 4 ret void } -attributes #0 = { "amdgpu-no-implicitarg-ptr" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 0a70734a65c206..3e0ad65c498213 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -114,9 +114,9 @@ define amdgpu_ps void @only_kill() #0 { ; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: s_mov_b64 exec, 0 -; CHECK-NEXT: exp null off, off, off, off done vm -; CHECK-NEXT: s_endpgm +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: exp null off, off, off, off done vm +; CHECK-NEXT: s_endpgm main_body: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll 
b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 7698372b687797..cb6073e9341e04 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -26,20 +26,20 @@ ; GCNHSA: .amdhsa_group_segment_fixed_size 0 ; GCNHSA: .amdhsa_private_segment_fixed_size 32772 ; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1 +; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCNHSA: .amdhsa_user_sgpr_queue_ptr 0 ; GCNHSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1 +; GCNHSA: .amdhsa_user_sgpr_dispatch_id 0 ; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1 ; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0 ; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1 -; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 0 ; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0 -; GCNHSA: .amdhsa_system_vgpr_workitem_id 2 +; GCNHSA: .amdhsa_system_vgpr_workitem_id 0 ; GCNHSA: .amdhsa_next_free_vgpr 3 -; GCNHSA: .amdhsa_next_free_sgpr 18 +; GCNHSA: .amdhsa_next_free_sgpr 10 ; GCNHSA: .amdhsa_float_round_mode_32 0 ; GCNHSA: .amdhsa_float_round_mode_16_64 0 ; GCNHSA: .amdhsa_float_denorm_mode_32 3 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 266ab687cd8d50..9619cb73b1538e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -93,7 +93,7 @@ define void @use_extern_overalign() #0 { define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; 
CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -113,27 +113,23 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 4 
@@ -156,7 +152,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -176,27 +172,23 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 
+; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -219,7 +211,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -239,27 +231,23 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; 
CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -282,7 +270,7 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -302,27 +290,23 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: 
s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 -; CHECK-NEXT: s_load_dword s15, s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s12, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] -; CHECK-NEXT: s_lshl_b32 s4, s15, 2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11] +; CHECK-NEXT: s_lshl_b32 s4, s12, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_add_i32 s4, s4, 8 @@ -352,29 +336,25 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: 
s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 0 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -385,37 +365,33 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; 
CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 4 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 4 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -429,29 +405,25 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: 
s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 2 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -462,37 +434,33 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 
s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 6 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 6 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], 
s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -506,29 +474,25 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 1 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: 
s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_normal @@ -539,37 +503,33 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 5 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; 
CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 5 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable @@ -583,29 +543,25 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 
2 +; CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 ; CHECK-NEXT: s_mov_b32 s15, 3 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: ds_write_b16 v3, v4 +; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm store i16 2, ptr addrspace(3) @kernel_overalign @@ -616,37 +572,33 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_add_u32 s10, s10, s15 +; CHECK-NEXT: s_add_u32 s6, s6, s9 ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 -; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; CHECK-NEXT: s_add_u32 s0, s0, s9 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_add_u32 s8, s6, 8 -; CHECK-NEXT: s_addc_u32 s9, s7, 0 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_module@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CHECK-NEXT: s_mov_b32 s15, 7 -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
CHECK-NEXT: s_add_u32 s8, s4, 8 +; CHECK-NEXT: s_addc_u32 s9, s5, 0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 7 ; CHECK-NEXT: ds_write_b16 v0, v1 ; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm call void @use_module() store i16 1, ptr addrspace(3) @module_variable diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index 9899d20cf3ae60..e1124f3ba89b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn 
-mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: not llc -mtriple=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s @@ -11,21 +11,21 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) { ; GCN-LABEL: name: load_zeroinit_lds_global ; GCN: bb.0 (%ir-block.0): - ; GCN: liveins: $sgpr2_sgpr3 - ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 - ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 - ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 - ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX8: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 + ; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 + ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; 
GCN: SI_INIT_M0 -1, implicit-def $m0 ; GCN: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 killed [[V_MOV_B32_e32_]], 40, 0, implicit $m0, implicit $exec - ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] - ; GFX8: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec - ; GFX9: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; GFX6: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec + ; GFX8: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; GCN: S_ENDPGM 0 %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 %ld = load i32, ptr addrspace(3) %gep diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index b231d307682ffe..161ec61ef1bdf8 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -192,6 +192,9 @@ ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O1-NEXT: AMDGPU Attributor +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces ; GCN-O1-NEXT: Dominator Tree Construction @@ -469,6 +472,9 @@ ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O1-OPTS-NEXT: AMDGPU Attributor +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -776,6 +782,9 @@ ; GCN-O2-NEXT: Function Alias 
Analysis Results ; GCN-O2-NEXT: Lower OpenCL enqueued blocks ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O2-NEXT: AMDGPU Attributor +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces ; GCN-O2-NEXT: Dominator Tree Construction @@ -1087,6 +1096,9 @@ ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Lower OpenCL enqueued blocks ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions +; GCN-O3-NEXT: AMDGPU Attributor +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll index 9445f1225e0cbe..9b63a8a3efcf92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -9,7 +9,7 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -27,7 +27,7 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,7 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, ; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -60,7 +60,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -79,7 +79,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -97,7 +97,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -114,7 +114,7 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -132,7 +132,7 @@ 
define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -149,7 +149,7 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] main_body: @@ -165,7 +165,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -184,7 +184,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -202,7 +202,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 
; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX12-NEXT: s_mov_b32 s4, 4 ; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 61f0f20f057043..5a15dc53a292cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -510,4 +510,4 @@ true: ret i32 42 false: ret i32 33 -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index be270439ef57c4..ca7385be5dee7b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index 50561de5bdbd20..b59e584418bd8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 
s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index ce6336da4fd962..0093e30b036444 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index 66b4f143c60d07..d896090a476651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x{{9|24}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index e1caf3bea61197..920ff8a927e2d1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 @@ -51,7 +51,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 @@ -67,8 +67,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) 
%out, float %x define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -78,10 +78,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -89,33 +89,33 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, ; ; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 -; GFX10-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4 +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -145,8 +145,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -188,13 +188,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -203,13 +203,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -218,10 +218,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -248,7 +246,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) 
%out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -264,7 +262,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -281,7 +279,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -292,7 +290,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -303,9 +301,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -328,7 +324,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr 
addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -344,7 +340,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -361,7 +357,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -372,7 +368,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -383,9 +379,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, 
s[2:3] glc dlc @@ -408,8 +402,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -428,8 +422,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -451,13 +445,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -466,13 +460,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -481,10 +475,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -512,8 +504,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -532,8 +524,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
-; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -555,13 +547,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -570,13 +562,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -585,10 +577,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -616,8 +606,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -636,8 +626,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -659,13 +649,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -674,13 +664,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -689,10 +679,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -721,8 +709,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -741,8 +729,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -764,13 +752,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -779,13 +767,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, 
v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -794,10 +782,8 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp ; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 50f1beba252272..f8a1388c9415e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -4,54 +4,19 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-LABEL: {{^}}dispatch_id: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s10 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s11 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { - %tmp0 = call i64 @llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) %out - ret void -} -; GCN-LABEL: {{^}}dispatch_id_opt0: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s8 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s9 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt0(ptr addrspace(1) %out) #2 { - %tmp0 = call i64 
@llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) %out - ret void -} - -; GCN-LABEL: {{^}}dispatch_id_opt1: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt1(ptr addrspace(1) %out) #3 { +define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}dispatch_id_opt2: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id_opt2() #4 { - %tmp0 = call i64 @llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) null - ret void -} - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { "amdgpu-no-dispatch-ptr" } -attributes #3 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } -attributes #4 = { "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index dcbfef0acadca5..95e50da8a4709b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -17,5 +17,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll index 18c711d0b2aecc..3b64a8707b55e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -59,7 +59,6 @@ define amdgpu_kernel void @one_f32() #0 { define amdgpu_kernel void @id_i32() #0 { ; GFX11-LABEL: id_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 m0, 0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-NEXT: s_endpgm @@ -71,8 +70,7 @@ define amdgpu_kernel void @id_i32() #0 { define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { ; GFX11-LABEL: id_arg_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 m0, s0 ; GFX11-NEXT: exp pos0 v0, off, off, off done row_en @@ -86,19 +84,16 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { define amdgpu_kernel void @id_row_i32() #0 { ; GFX11-SDAG-LABEL: id_row_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: id_row_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0 ; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en ; GFX11-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index a26b84e17374af..17b941c59fd3f3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,7 +15,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -27,7 +27,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -37,7 +37,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) 
%out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -78,7 +78,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| @@ -126,7 +126,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -136,7 +136,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -150,10 +150,10 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -164,23 +164,23 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; 
SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -191,13 +191,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -208,10 +208,10 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 
0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -222,23 +222,23 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -249,13 +249,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_one: ; GISEL-GFX10: ; %bb.0: ; 
GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -266,10 +266,10 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -280,23 +280,23 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: 
v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -307,13 +307,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -324,10 +324,10 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; 
SDAG-GFX11-LABEL: v_fcmp_f32_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -338,23 +338,23 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -365,13 +365,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -382,10 +382,10 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -396,23 +396,23 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -423,13 +423,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; 
GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -440,10 +440,10 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -454,23 +454,23 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 
v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -481,13 +481,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -498,10 +498,10 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -512,23 +512,23 @@ define amdgpu_kernel 
void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -539,13 +539,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; 
GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -556,10 +556,10 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -570,23 +570,23 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, 
s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -597,13 +597,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uo: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -614,10 +614,10 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; 
SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -628,23 +628,23 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -655,13 +655,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -672,10 +672,10 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -686,23 +686,23 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -713,13 +713,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -730,10 +730,10 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -744,23 +744,23 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -771,13 +771,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: 
v_fcmp_f32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -788,10 +788,10 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -802,23 +802,23 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -829,13 +829,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -846,10 +846,10 @@ define amdgpu_kernel void 
@v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -860,23 +860,23 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -887,13 +887,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -904,10 +904,10 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX11-LABEL: v_fcmp_f32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -918,23 +918,23 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; SDAG-GFX10-LABEL: v_fcmp_f32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -945,13 +945,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GISEL-GFX10-LABEL: v_fcmp_f32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; 
GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i32 %result, ptr addrspace(1) %out @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -973,7 +973,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oeq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -996,7 +996,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oeq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3] @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr 
addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_one: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1023,7 +1023,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_one: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1033,7 +1033,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_one: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_one: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1061,7 +1061,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ogt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ogt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3] @@ -1111,7 +1111,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_oge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr 
addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_oge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_oge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_oge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_olt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_olt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_olt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_olt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3] @@ -1211,7 +1211,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ole: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ole: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ole: ; GISEL-GFX11: ; 
%bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1246,7 +1246,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ole: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3] @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1273,7 +1273,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ueq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, 
s[2:3] @@ -1296,7 +1296,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ueq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3] @@ -1311,7 +1311,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_o: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_o: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_o: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1346,7 +1346,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_o: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3] @@ -1361,7 +1361,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uo: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1373,7 +1373,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_uo: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uo: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1396,7 +1396,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uo: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3] @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define 
amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_une: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1423,7 +1423,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_une: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_une: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_une: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3] @@ -1461,7 +1461,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_uge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1523,7 +1523,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { 
; ; SDAG-GFX10-LABEL: v_fcmp_f64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1546,7 +1546,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3] @@ -1561,7 +1561,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3] @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; SDAG-GFX11-LABEL: v_fcmp_f64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1623,7 +1623,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; SDAG-GFX10-LABEL: v_fcmp_f64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX11-LABEL: v_fcmp_f64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1646,7 +1646,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3] @@ -1663,12 +1663,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1678,26 +1678,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1707,14 +1707,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, s4, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0| +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half %temp, i32 1) @@ -1727,12 +1727,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; SDAG-GFX11-NEXT: s_nop 0 @@ -1742,26 +1742,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, 
v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_nop 0 @@ -1771,14 +1771,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: s_lshr_b32 s2, s4, 16 -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, |s4|, |s2| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %temp = call half @llvm.fabs.f16(half %a) %src_input = call half @llvm.fabs.f16(half %src) @@ -1798,7 +1798,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX11-LABEL: 
v_fcmp_f16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1808,7 +1808,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GISEL-GFX10-LABEL: v_fcmp_f16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1823,10 +1823,10 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1837,23 +1837,23 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oeq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword 
v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1864,13 +1864,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oeq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1) store i32 %result, ptr addrspace(1) %out @@ -1882,10 +1882,10 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_one: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c 
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1896,23 +1896,23 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_one: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_one: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1923,13 +1923,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, 
half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_one: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6) store i32 %result, ptr addrspace(1) %out @@ -1941,10 +1941,10 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1955,23 +1955,23 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ogt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1982,13 +1982,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ogt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2) store i32 %result, ptr addrspace(1) %out @@ -2000,10 +2000,10 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr 
addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_oge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2014,23 +2014,23 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_oge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_oge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2041,13 +2041,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_oge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3) store i32 %result, ptr addrspace(1) %out @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_olt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2073,23 +2073,23 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_olt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_olt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2100,13 +2100,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_olt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 
@llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4) store i32 %result, ptr addrspace(1) %out @@ -2118,10 +2118,10 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ole: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2132,23 +2132,23 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ole: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ole: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2159,13 +2159,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ole: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5) store i32 %result, ptr addrspace(1) %out @@ -2177,10 +2177,10 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2191,23 +2191,23 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; 
SDAG-GFX10-LABEL: v_fcmp_f16_ueq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2218,13 +2218,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ueq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; 
GISEL-GFX10-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9) store i32 %result, ptr addrspace(1) %out @@ -2236,10 +2236,10 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_une: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2250,23 +2250,23 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_une: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_une: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2277,13 +2277,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_une: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14) store i32 %result, ptr addrspace(1) %out @@ -2295,10 +2295,10 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2309,23 +2309,23 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2336,13 +2336,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10) store i32 %result, ptr addrspace(1) %out @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2368,23 +2368,23 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; 
; GISEL-GFX11-LABEL: v_fcmp_f16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2395,13 +2395,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11) store i32 %result, ptr addrspace(1) %out @@ -2413,10 +2413,10 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt 
lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2427,23 +2427,23 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2454,13 +2454,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12) store i32 %result, ptr addrspace(1) %out @@ -2471,10 +2471,10 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_o: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2485,23 +2485,23 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_o: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; 
SDAG-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_o: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2512,13 +2512,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_o: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7) store i32 %result, ptr addrspace(1) %out @@ -2529,10 +2529,10 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_uo: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 
s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2543,23 +2543,23 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_uo: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_uo: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2570,13 +2570,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_uo: ; 
GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8) store i32 %result, ptr addrspace(1) %out @@ -2587,10 +2587,10 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX11-LABEL: v_fcmp_f16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2601,23 +2601,23 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; SDAG-GFX10-LABEL: v_fcmp_f16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 +; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2628,13 +2628,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GISEL-GFX10-LABEL: v_fcmp_f16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13) store i32 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 7e78d8b05d09f6..ce055d65279966 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -30,7 +30,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| @@ -74,7 +74,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; 
GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| @@ -137,7 +137,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -151,7 +151,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; 
GFX9-GISEL-LABEL: v_fcmp_f32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -163,7 +163,7 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -178,11 +178,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -193,24 +193,24 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: 
v_cmp_eq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -220,11 +220,11 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -240,11 +240,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -255,24 +255,24 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -282,11 +282,11 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -302,11 +302,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -317,24 +317,24 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 
+; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -344,11 +344,11 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -364,11 +364,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -379,24 +379,24 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], 
s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -406,11 +406,11 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -426,11 +426,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 
0x42c80000, s4 +; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -441,24 +441,24 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -468,11 +468,11 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; 
VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -488,11 +488,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -503,24 +503,24 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -530,11 +530,11 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -550,11 +550,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -565,24 +565,24 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -592,11 +592,11 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -612,11 +612,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -627,24 +627,24 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -654,11 +654,11 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c 
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -674,11 +674,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -689,24 +689,24 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -716,11 +716,11 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -736,11 +736,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -751,24 +751,24 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -778,11 +778,11 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -798,11 +798,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 
0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -813,24 +813,24 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -840,11 +840,11 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ugt: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -860,11 +860,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -875,24 +875,24 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: 
s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -902,11 +902,11 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -922,11 +922,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -937,24 +937,24 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, 
float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -964,11 +964,11 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -984,11 +984,11 @@ define amdgpu_kernel void 
@v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; GFX11-LABEL: v_fcmp_f32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s4 +; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -999,24 +999,24 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; GFX9-LABEL: v_fcmp_f32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 
v1, s1 @@ -1026,11 +1026,11 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; ; VI-GISEL-LABEL: v_fcmp_f32_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oeq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1059,7 +1059,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1086,7 +1086,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: 
v_fcmp_f64_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1105,7 +1105,7 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_one: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1119,7 +1119,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1146,7 +1146,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,7 +1165,7 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ogt(ptr 
addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ogt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1192,7 +1192,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1206,7 +1206,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_oge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1239,7 +1239,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; 
GFX9-LABEL: v_fcmp_f64_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1252,7 +1252,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_olt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_olt: ; VI-SDAG: ; %bb.0: -; 
VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,7 +1345,7 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ole: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1359,7 +1359,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ueq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1419,7 +1419,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1465,7 +1465,7 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_o: ; GFX11: ; 
%bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1492,7 +1492,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1506,7 +1506,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1525,7 +1525,7 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1585,7 +1585,7 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_une: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1626,7 +1626,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 
v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1719,7 +1719,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1732,7 +1732,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,7 +1765,7 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1779,7 +1779,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1792,7 +1792,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1806,7 +1806,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1825,7 +1825,7 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) { define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; GFX11-LABEL: v_fcmp_f64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3] @@ -1839,7 +1839,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; GFX9-LABEL: v_fcmp_f64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1852,7 +1852,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-SDAG-LABEL: v_fcmp_f64_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) { ; ; VI-GISEL-LABEL: v_fcmp_f64_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,13 +1887,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1904,26 +1904,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; GFX9-LABEL: v_fcmp_f16_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -1933,12 +1933,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1956,13 +1956,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |s2| +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1973,26 +1973,26 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; GFX9-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_lshr_b32 s2, s4, 16 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2002,12 +2002,12 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_lshr_b32 s2, s4, 16 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s4|, |v0| +; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2028,7 +2028,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX11-GISEL-LABEL: v_fcmp_f16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -2042,7 +2042,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; GFX9-GISEL-LABEL: v_fcmp_f16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -2070,11 +2070,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oeq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2085,24 +2085,24 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oeq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm 
; ; VI-SDAG-LABEL: v_fcmp_f16_oeq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,11 +2112,11 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oeq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2133,11 +2133,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_one: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2148,24 +2148,24 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; 
GFX9-LABEL: v_fcmp_f16_one: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_one: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2175,11 +2175,11 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_one: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2196,11 +2196,11 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half 
%src) { ; GFX11-LABEL: v_fcmp_f16_ogt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2211,24 +2211,24 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ogt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ogt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2238,11 +2238,11 @@ define amdgpu_kernel void 
@v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ogt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2259,11 +2259,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_oge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2274,24 +2274,24 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_oge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_oge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2301,11 +2301,11 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_oge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2322,11 +2322,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_olt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2337,24 
+2337,24 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_olt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_olt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2364,11 +2364,11 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_olt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ 
-2385,11 +2385,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ole: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2400,24 +2400,24 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ole: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ole: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ole: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2448,11 +2448,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ueq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,24 +2463,24 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ueq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; 
GFX9-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ueq: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2490,11 +2490,11 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ueq: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2511,11 +2511,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_une: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,24 +2526,24 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_une: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_une: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_une: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2574,11 +2574,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2589,24 +2589,24 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ugt: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; 
VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2616,11 +2616,11 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ugt: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2637,11 +2637,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2652,24 +2652,24 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uge: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2679,11 +2679,11 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uge: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2700,11 +2700,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s4 +; 
GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2715,24 +2715,24 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ult: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2742,11 +2742,11 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ult: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
-; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,11 +2762,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_o: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2777,24 +2777,24 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_o: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_o: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; 
VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2804,11 +2804,11 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_o: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2824,11 +2824,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_uo: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2839,24 +2839,24 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_uo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_uo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2866,11 +2866,11 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_uo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -2886,11 +2886,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; GFX11-LABEL: v_fcmp_f16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s4 +; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -2901,24 +2901,24 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; GFX9-LABEL: v_fcmp_f16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x5640 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f16_ule: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2928,11 +2928,11 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; ; VI-GISEL-LABEL: v_fcmp_f16_ule: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, 0x5640 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s4, v0 +; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 78d5da8dda177b..ca06a57be19ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -8,7 +8,7 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -34,17 +34,18 @@ entry: } define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( -; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v0, off, s2 -; GFX11-NEXT: scratch_load_u16 v1, off, s3 -; GFX11-NEXT: scratch_load_b32 v2, off, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11-NEXT: scratch_store_b16 off, v0, s0 -; GFX11-NEXT: s_endpgm +; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: +; SDAG-GFX11: ; %bb.0: ; %entry +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 +; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 +; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1 +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) 
+; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; SDAG-GFX11-NEXT: s_endpgm +; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry ; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 @@ -94,5 +95,3 @@ entry: } declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SDAG-GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 1343f25ec275e5..99c3deaada8c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] @@ -35,7 +35,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; SDAG-GFX11: ; %bb.0: ; %entry -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 ; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( ; ; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: ; GISEL-GFX11: ; %bb.0: ; %entry -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 ; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 8a8b0490e9480b..e51b1d2da2e414 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -37,7 +37,7 @@ entry: define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index e74485142fb6f0..d318bc80e49760 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -107,4 +107,4 @@ declare float @llvm.amdgcn.fmul.legacy(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll index f631a0bfc28eb0..434fa1bf7b340b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll @@ -7,7 +7,7 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64) define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) ; ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -31,12 +31,11 @@ entry: define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) { ; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6 ; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -47,8 +46,8 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a ; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 0e659b758cd0fc..25f3ccc69f5adf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -10,7 +10,7 @@ declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 @@ -29,7 +29,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 @@ -48,7 +48,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8f16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v8bf16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index d941830e8dafc9..51ad0032f7ce1b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -10,7 +10,7 @@ declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b64_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 @@ -29,7 +29,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4i16: ; GFX12: ; %bb.0: ; 
%entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 @@ -48,7 +48,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4f16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 @@ -67,7 +67,7 @@ entry: define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-LABEL: global_load_tr_b128_v4bf16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 9e3e393d82e223..309fd99031155d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -22,10 +22,10 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; 
SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -36,23 +36,23 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -63,13 +63,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s2, 
0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -97,7 +97,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i32: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -111,10 +111,10 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -125,23 +125,23 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: 
v_icmp_i32_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -152,13 +152,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4 +; 
GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -169,10 +169,10 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -183,23 +183,23 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -210,13 +210,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -227,10 +227,10 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -241,23 +241,23 
@@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -268,13 +268,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; 
GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -285,10 +285,10 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -299,23 +299,23 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -326,13 +326,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -343,10 +343,10 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -357,23 +357,23 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -384,13 +384,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -401,10 +401,10 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i32_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -415,23 +415,23 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i32_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sgt: ; 
GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -442,13 +442,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i32_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -459,10 +459,10 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; 
SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -473,23 +473,23 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -500,13 +500,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; 
GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -517,10 +517,10 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -531,23 +531,23 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; 
SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -558,13 +558,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i32 %result, ptr addrspace(1) %out @@ -575,10 +575,10 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX11-LABEL: v_icmp_i32_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -589,23 +589,23 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-GFX10-LABEL: v_icmp_i32_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i32_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -616,13 +616,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-GFX10-LABEL: v_icmp_i32_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c 
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -632,7 +632,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_eq: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -644,7 +644,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_eq: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -654,7 +654,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_eq: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -667,7 +667,7 @@ define 
amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_eq: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3] @@ -682,7 +682,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_ne: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -694,7 +694,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_ne: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -704,7 +704,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_ne: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -717,7 +717,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_ne: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3] @@ -732,7 +732,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ugt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -744,7 +744,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ugt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -754,7 +754,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ugt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ugt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3] @@ -782,7 +782,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_uge: ; SDAG-GFX11: ; 
%bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -794,7 +794,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_uge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -804,7 +804,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_uge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -817,7 +817,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_uge: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3] @@ -832,7 +832,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ult: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -844,7 +844,7 @@ define amdgpu_kernel void 
@v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ult: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -854,7 +854,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ult: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -867,7 +867,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ult: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_u64_ule: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -894,7 +894,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_u64_ule: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -904,7 +904,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_u64_ule: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_u64_ule: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3] @@ -932,7 +932,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sgt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -944,7 +944,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sgt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -954,7 +954,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sgt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sgt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3] @@ -982,7 +982,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sge: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +994,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sge: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sge: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1017,7 +1017,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sge: ; 
GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3] @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_slt: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_slt: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1054,7 +1054,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_slt: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_slt: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3] @@ -1082,7 +1082,7 @@ 
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-GFX11-LABEL: v_icmp_i64_sle: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-GFX10-LABEL: v_icmp_i64_sle: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i64_sle: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1117,7 +1117,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i64_sle: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3] @@ -1133,10 +1133,10 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_eq: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1147,23 +1147,23 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_eq: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_eq: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1174,13 +1174,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_eq: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: 
s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX10-LABEL: v_icmp_i16: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1] @@ -1222,10 +1222,10 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ne: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: 
v_cmp_ne_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1236,23 +1236,23 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ne: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ne: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1263,13 +1263,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ne: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: 
s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33) store i32 %result, ptr addrspace(1) %out @@ -1280,10 +1280,10 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ugt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1294,23 +1294,23 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ugt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: 
v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ugt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1321,13 +1321,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ugt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34) store i32 %result, ptr addrspace(1) %out @@ -1338,10 +1338,10 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_uge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; 
SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1352,23 +1352,23 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_uge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_uge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1379,13 +1379,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_uge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; 
GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35) store i32 %result, ptr addrspace(1) %out @@ -1396,10 +1396,10 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ult: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1410,23 +1410,23 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ult: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, 
s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ult: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1437,13 +1437,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ult: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36) store i32 %result, ptr addrspace(1) %out @@ -1454,10 +1454,10 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_ule: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: 
s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1468,23 +1468,23 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_ule: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_ule: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1495,13 +1495,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) 
%out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_ule: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37) store i32 %result, ptr addrspace(1) %out @@ -1512,10 +1512,10 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX11-LABEL: v_icmp_i16_sgt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1526,23 +1526,23 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-GFX10-LABEL: v_icmp_i16_sgt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sgt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1553,13 +1553,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-GFX10-LABEL: v_icmp_i16_sgt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38) store i32 %result, ptr addrspace(1) %out @@ -1570,10 +1570,10 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) 
%out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sge: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1584,23 +1584,23 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sge: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sge: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; 
GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1611,13 +1611,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sge: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39) store i32 %result, ptr addrspace(1) %out @@ -1628,10 +1628,10 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_slt: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1642,23 +1642,23 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_slt: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: 
s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_slt: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1669,13 +1669,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_slt: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40) 
store i32 %result, ptr addrspace(1) %out @@ -1686,10 +1686,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX11-LABEL: v_icmp_i16_sle: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1700,23 +1700,23 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-GFX10-LABEL: v_icmp_i16_sle: ; SDAG-GFX10: ; %bb.0: ; SDAG-GFX10-NEXT: s_clause 0x1 -; SDAG-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_icmp_i16_sle: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 +; GISEL-GFX11-NEXT: 
v_cmp_ge_i16_e64 s2, 0x64, s2 ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1727,13 +1727,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-GFX10-LABEL: v_icmp_i16_sle: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_clause 0x1 -; GISEL-GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s4 -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX10-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41) store i32 %result, ptr addrspace(1) %out @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 60e242bf5b0e8f..5f979e0177f588 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -25,11 +25,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -40,11 +40,11 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -54,24 +54,24 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -98,7 +98,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i32: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-VI-LABEL: v_icmp_i32: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i32: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: 
global_store_dwordx2 v0, v[0:1], s[0:1] @@ -131,11 +131,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -146,11 +146,11 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -160,24 +160,24 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 
s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -193,11 +193,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -208,11 +208,11 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: 
v_mov_b32_e32 v1, s1 @@ -222,24 +222,24 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -255,11 +255,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -270,11 +270,11 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -284,24 +284,24 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: 
v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -317,11 +317,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -332,11 +332,11 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -346,24 +346,24 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: 
v_cmp_lt_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -379,11 +379,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -394,11 +394,11 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: 
v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -408,24 +408,24 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -441,11 +441,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GFX11-LABEL: v_icmp_i32_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -456,11 +456,11 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i32_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -470,24 +470,24 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; ; GFX9-LABEL: v_icmp_i32_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -503,11 +503,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -518,11 +518,11 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -532,24 +532,24 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -565,11 +565,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -580,11 +580,11 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; 
SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -594,24 +594,24 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -627,11 +627,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GFX11-LABEL: v_icmp_i32_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -642,11 +642,11 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; SDAG-VI-LABEL: v_icmp_i32_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -656,24 +656,24 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; ; GFX9-LABEL: v_icmp_i32_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i32_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: 
v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -688,7 +688,7 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_eq: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3] @@ -702,7 +702,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -729,7 +729,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -748,7 +748,7 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_ne: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3] @@ -762,7 +762,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +776,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -789,7 +789,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -808,7 +808,7 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ugt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3] @@ -822,7 +822,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -836,7 +836,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -849,7 +849,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,7 +868,7 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_uge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3] @@ -882,7 +882,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) 
{ ; ; GISEL-VI-LABEL: v_icmp_u64_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -928,7 +928,7 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_u64_ult: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3] @@ -942,7 +942,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -956,7 +956,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -969,7 +969,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -988,7 +988,7 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; 
GFX11-LABEL: v_icmp_u64_ule: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3] @@ -1002,7 +1002,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_u64_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_u64_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_u64_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sgt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1076,7 +1076,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sge: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3] @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_slt: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3] @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1196,7 +1196,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1228,7 +1228,7 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; GFX11-LABEL: v_icmp_i64_sle: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3] @@ -1242,7 +1242,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; SDAG-VI-LABEL: v_icmp_i64_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,7 +1256,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GFX9-LABEL: v_icmp_i64_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -1269,7 +1269,7 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; ; GISEL-VI-LABEL: v_icmp_i64_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1289,11 +1289,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_eq: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_cmp_eq_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1304,11 +1304,11 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_eq: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1318,24 +1318,24 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_eq: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_eq: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: 
s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1362,7 +1362,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX11-LABEL: v_icmp_i16: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-VI-LABEL: v_icmp_i16: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1381,7 +1381,7 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) { ; ; GISEL-GFX9-LABEL: v_icmp_i16: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] @@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ne: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1410,11 +1410,11 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ne: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1424,24 +1424,24 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ne: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ne: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 
v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1457,11 +1457,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ugt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1472,11 +1472,11 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ugt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1486,24 +1486,24 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ugt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ugt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1519,11 +1519,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_uge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_uge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 
v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1548,24 +1548,24 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_uge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_uge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1581,11 +1581,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ult: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1596,11 +1596,11 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ult: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,24 +1610,24 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ult: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ult: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_u16_e64 
s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1643,11 +1643,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_ule: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_ule: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1672,24 +1672,24 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_ule: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 
v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_ule: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1705,11 +1705,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GFX11-LABEL: v_icmp_i16_sgt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1720,11 +1720,11 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; SDAG-VI-LABEL: v_icmp_i16_sgt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: 
v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1734,24 +1734,24 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; ; GFX9-LABEL: v_icmp_i16_sgt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sgt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,11 +1767,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sge: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: 
v_cmp_le_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sge: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1796,24 +1796,24 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sge: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_ge_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sge: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: 
v_cmp_ge_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1829,11 +1829,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_slt: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1844,11 +1844,11 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_slt: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1858,24 +1858,24 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_slt: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, 
v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_slt: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1891,11 +1891,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GFX11-LABEL: v_icmp_i16_sle: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s4 +; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; SDAG-VI-LABEL: v_icmp_i16_sle: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: 
v_cmp_le_i16_e64 s[2:3], s4, v0 +; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1920,24 +1920,24 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; ; GFX9-LABEL: v_icmp_i16_sle: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x64 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GISEL-VI-LABEL: v_icmp_i16_sle: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s4, v0 +; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1952,7 +1952,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 @@ -1970,7 +1970,7 @@ 
define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_u32 s2, 1 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -1986,7 +1986,7 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_gt_u32 s2, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 3168e05b816bee..dba67a03c000e5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,9 +14,8 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -152,11 +151,11 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, 
s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -178,7 +177,6 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 -; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index f7f72ae31cc1db..70eff494501532 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -310,10 +310,10 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32> declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2 declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 -attributes #0 = { nounwind noinline "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } -attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #0 = { nounwind noinline } +attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" } attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-dispatch-ptr" "amdgpu-no-queue-ptr" } +attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 2d01703c78d78d..f1a4fe0f090b16 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define 
amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -189,7 +189,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -215,13 +215,11 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -230,8 +228,8 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; 
GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -262,7 +260,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 @@ -285,7 +283,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 @@ -308,22 +306,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s0, 
s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 -; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -352,9 +349,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -378,9 +375,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000 @@ -404,16 +401,15 @@ define 
amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; ; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,9 +443,9 @@ main_body: define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 @@ -470,9 +466,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1030-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 @@ -493,23 +489,21 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 0076079ce17c77..bc10eb68d75cbb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_private_vgpr: ; GCN-DAG: 
{{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] ; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. ; GCN-LABEL: {{^}}is_private_sgpr: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x1{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x1{{$}} -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x32{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x32{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; CI: s_cmp_eq_u32 [[APERTURE]], [[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index e24c47991fe3d7..aad4d924952fff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] @@ -26,10 +26,10 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. 
; GCN-LABEL: {{^}}is_local_sgpr: -; CI-DAG: s_load_dword s0, s[6:7], 0x1 +; CI-DAG: s_load_dword s0, s[4:5], 0x1 -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x33{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x33{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index ee005eb6e98410..8dba22312ac88c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,9 +4,9 @@ ; ALL-LABEL: {{^}}test: ; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; CO-V4: s_load_dword s{{[0-9]+}}, s[8:9], 0xa +; CO-V4: s_load_dword s{{[0-9]+}}, s[4:5], 0xa -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa ; HSA: .amdhsa_kernarg_size 8 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 1 @@ -23,7 +23,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) #1 { ; OS-MESA3D: kernarg_segment_alignment = 4 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 -; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[4:5], 0x15 +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 ; HSA: .amdhsa_kernarg_size 8 define amdgpu_kernel void @test_implicit(ptr addrspace(1) %out) #1 { %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -78,7 +78,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(ptr addrspace(1) %out, ; HSA: .amdhsa_kernarg_size 0 ; HSA: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -define amdgpu_kernel void @test_no_kernargs() #4 { +define amdgpu_kernel void @test_no_kernargs() #1 { %kernarg.segment.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() %gep = 
getelementptr i32, ptr addrspace(4) %kernarg.segment.ptr, i64 10 %value = load i32, ptr addrspace(4) %gep @@ -123,7 +123,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" } -attributes #4 = { nounwind "amdgpu-implicitarg-num-bytes"="0" "amdgpu-no-implicitarg-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index c201f84cac7268..61818dafd2b84c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GCN-NEXT: s_add_i32 s2, s10, 42 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_add_i32 s2, s6, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -42,27 +42,21 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l ; GCN-LABEL: indirect_lds_id: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s8, s6, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; 
GCN-NEXT: s_addc_u32 s9, s7, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 ; GCN-NEXT: s_mov_b32 s15, 21 +; GCN-NEXT: s_mov_b32 s12, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm call void @function_lds_id(ptr addrspace(1) %out) @@ -72,7 +66,7 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 8e9a652ae8a8ef..1ae22c3eec185b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -15,20 +15,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,8 +41,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-LABEL: v_permlane16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -60,20 +60,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -86,8 +86,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-LABEL: v_permlane16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -105,36 +105,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 
v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -149,8 +149,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -165,8 +165,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_i64: ; 
GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -202,36 +202,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, 
s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -246,8 +246,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -262,8 +262,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -278,8 +278,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -299,22 +299,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -342,22 +342,22 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vii_f32: ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -367,7 +367,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; ; GFX12-LABEL: v_permlane16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +384,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -396,7 +396,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -422,7 +422,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -436,7 +436,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -450,7 +450,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -481,7 +481,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; 
GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -535,7 +535,7 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -556,25 +556,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-LABEL: v_permlane16_b32_vll_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -601,7 +601,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -643,7 +643,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -659,7 +659,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -675,7 +675,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -697,25 +697,25 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX10-LABEL: v_permlane16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vll_f32: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -724,7 +724,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; ; GFX12-LABEL: v_permlane16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -742,7 +742,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -755,7 +755,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -768,7 +768,7 @@ define 
amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -784,7 +784,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -800,7 +800,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -816,7 +816,7 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -838,33 +838,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX10-LABEL: v_permlane16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 
0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -873,17 +873,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; 
GFX11-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -891,7 +891,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -909,7 +909,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | 
instid1(VALU_DEP_2) @@ -933,7 +933,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -947,7 +947,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -961,7 +961,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX11-LABEL: v_permlane16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -981,7 +981,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; ; GFX12-LABEL: v_permlane16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1009,33 +1009,33 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX10-LABEL: v_permlane16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: 
s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1044,17 +1044,17 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr 
addrspace(1) %out, float ; GFX11-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1080,7 +1080,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -1104,7 +1104,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX11-LABEL: v_permlane16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; ; GFX12-LABEL: v_permlane16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -1179,7 +1179,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; 
GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1201,12 +1201,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1215,12 +1215,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1229,12 +1228,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1243,12 +1242,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1264,70 +1262,102 @@ define amdgpu_kernel void 
@v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 
s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: 
v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1337,7 +1367,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -1348,7 +1378,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1359,12 +1389,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -1373,12 +1403,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -1387,12 +1416,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: 
v_permlane16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -1401,12 +1430,11 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -1422,70 +1450,102 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs_f64: ; 
GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlane16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlane16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlane16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; 
GFX12-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1495,7 +1555,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX10-SDAG: 
; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1506,7 +1566,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1517,7 +1577,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1532,7 +1592,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1547,7 +1607,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1562,7 +1622,7 @@ define amdgpu_kernel void 
@v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1584,38 +1644,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1633,8 +1695,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1652,8 +1714,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ 
-1671,8 +1733,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1695,7 +1757,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -1706,7 +1768,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1717,7 +1779,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1732,7 +1794,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr 
addrspace(1) %out, float ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1747,7 +1809,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -1762,7 +1824,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1784,38 +1846,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX10-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 
s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1833,8 +1897,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1852,8 +1916,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1871,8 +1935,8 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -1896,20 +1960,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: 
v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1922,8 +1986,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1941,36 +2005,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; 
GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -1985,8 +2049,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2001,8 +2065,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; 
GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2017,8 +2081,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2038,20 +2102,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2064,8 +2128,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2083,36 +2147,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2127,8 +2191,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2143,8 +2207,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2159,8 +2223,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2180,20 +2244,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX10-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2206,8 +2270,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-LABEL: v_permlane16_b32_vss_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2225,36 +2289,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: 
s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2269,8 +2333,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2285,8 +2349,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2301,8 +2365,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2322,20 +2386,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX10-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2348,8 +2412,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-LABEL: v_permlane16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2367,36 +2431,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 
s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2411,8 +2475,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2427,8 +2491,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2443,8 +2507,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2464,20 +2528,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2490,8 +2554,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2509,36 +2573,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, 
v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2553,8 +2617,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2569,8 +2633,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2585,8 +2649,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2606,20 +2670,20 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; 
GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2632,8 +2696,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2651,36 +2715,36 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2695,8 +2759,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2711,8 +2775,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: 
s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2727,8 +2791,8 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2748,20 +2812,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vss_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -2774,8 +2838,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-LABEL: v_permlanex16_b32_vss_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2793,20 +2857,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vss_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2819,8 +2883,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-LABEL: v_permlanex16_b32_vss_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2838,36 +2902,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 
0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2882,8 +2946,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2898,8 +2962,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2914,8 +2978,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2935,36 +2999,36 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -2979,8 +3043,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2995,8 +3059,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -3011,8 +3075,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -3032,22 +3096,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vii_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3057,7 +3121,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vii_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3075,22 +3139,22 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vii_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vii_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -3100,7 +3164,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vii_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3117,7 +3181,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3129,7 +3193,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3141,7 +3205,7 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3155,7 +3219,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3169,7 +3233,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3183,7 +3247,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vii_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3202,7 +3266,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3214,7 +3278,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3226,7 +3290,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3240,7 +3304,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vii_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3254,7 +3318,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vii_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3268,7 +3332,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: 
v_permlanex16_b32_vii_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3289,25 +3353,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vll_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3316,7 +3380,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; ; GFX12-LABEL: v_permlanex16_b32_vll_i32: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3335,25 +3399,25 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vll_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_movk_i32 s2, 0x1234 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_movk_i32 s0, 0x1234 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vll_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_movk_i32 s2, 0x1234 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3362,7 +3426,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; ; GFX12-LABEL: v_permlanex16_b32_vll_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 
0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX12-NEXT: s_movk_i32 s2, 0x1234 @@ -3380,7 +3444,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3393,7 +3457,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3406,7 +3470,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3422,7 +3486,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3438,7 +3502,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; 
GFX12-SDAG-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3454,7 +3518,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_i64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3475,7 +3539,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3488,7 +3552,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3501,7 +3565,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3517,7 +3581,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3533,7 +3597,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2 @@ -3549,7 +3613,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vll_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3571,33 +3635,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX10-LABEL: v_permlanex16_b32_vvv_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3606,17 +3670,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3624,7 +3688,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3642,7 +3706,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3667,33 +3731,33 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX10-LABEL: v_permlanex16_b32_vvv_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10-NEXT: v_readfirstlane_b32 s3, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3702,17 +3766,17 @@ define amdgpu_kernel 
void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3720,7 +3784,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -3738,7 +3802,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) @@ -3762,7 +3826,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3776,7 +3840,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3790,7 +3854,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX11-LABEL: v_permlanex16_b32_vvv_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3810,7 +3874,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; ; GFX12-LABEL: v_permlanex16_b32_vvv_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3837,7 +3901,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, double %src0) { ; GFX10-SDAG-LABEL: 
v_permlanex16_b32_vvv_f64: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -3851,7 +3915,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvv_f64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -3865,7 +3929,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX11-LABEL: v_permlanex16_b32_vvv_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 @@ -3885,7 +3949,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; ; GFX12-LABEL: v_permlanex16_b32_vvv_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -3912,7 +3976,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -3923,7 +3987,7 
@@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3934,12 +3998,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -3948,12 +4012,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, 
s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -3962,12 +4025,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -3976,12 +4039,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -3996,7 +4058,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, float %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 @@ -4007,7 +4069,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4018,12 +4080,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 @@ -4032,12 +4094,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 @@ -4046,12 +4107,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 @@ -4060,12 +4121,11 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 @@ -4081,144 +4141,208 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; 
GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvs_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, 
s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, 
v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) - store i64 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
+; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, double %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s2 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s2 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], 
s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s2 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s2 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_permlanex16_b32_vvs_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm ; -; GFX12-LABEL: v_permlanex16_b32_vvs_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s1, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s1, s0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_nop 0 -; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-NEXT: s_endpgm +; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-GISEL-NEXT: s_nop 0 +; 
GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s1, s0 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s1, s0 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4228,7 +4352,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr 
addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -4239,7 +4363,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4250,7 +4374,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4265,7 +4389,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4280,7 +4404,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4295,7 +4419,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4316,7 +4440,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, float %src0, i32 %src1) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 @@ -4327,7 +4451,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4338,7 +4462,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4353,7 +4477,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; 
GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4368,7 +4492,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 @@ -4383,7 +4507,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f32: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4405,38 +4529,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4454,8 +4580,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, 
s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4473,8 +4599,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4492,8 +4618,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4517,38 +4643,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s0 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s0 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s0 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4566,8 +4694,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; 
GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4585,8 +4713,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4604,8 +4732,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -4629,20 +4757,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4655,8 +4783,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4674,20 +4802,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4700,8 +4828,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4719,36 +4847,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4763,8 +4891,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4779,8 +4907,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4795,8 +4923,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4816,36 +4944,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4860,8 +4988,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4876,8 +5004,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -4892,8 +5020,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -4913,20 +5041,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4939,8 +5067,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlanex16_b32_vss_bc_i32: ; GFX12: ; 
%bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4958,20 +5086,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX10-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4984,8 +5112,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-LABEL: v_permlanex16_b32_vss_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5003,36 +5131,36 @@ define 
amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5047,8 +5175,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5063,8 +5191,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5079,8 +5207,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5100,36 +5228,36 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 
s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5144,8 +5272,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5160,8 +5288,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5176,8 +5304,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5197,20 +5325,20 @@ define amdgpu_kernel void 
@v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5223,8 +5351,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5242,20 +5370,20 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s2 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5268,8 +5396,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_vss_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5287,36 +5415,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5331,8 +5459,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5347,8 +5475,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5363,8 +5491,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5384,36 +5512,36 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; 
GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5428,8 +5556,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5444,8 +5572,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6 @@ -5460,8 +5588,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -5481,24 +5609,23 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 
s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5506,13 +5633,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5526,24 +5652,23 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX10-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; 
GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5551,13 +5676,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-LABEL: v_permlane16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5572,41 +5696,40 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX10-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; 
GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, 
v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -5614,15 +5737,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -5630,15 +5752,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 
0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -5646,15 +5767,14 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5670,12 +5790,12 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -5683,79 +5803,75 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 
0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5771,24 +5887,23 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5796,13 +5911,12 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5817,24 +5931,23 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -5842,13 +5955,12 
@@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -5864,39 +5976,38 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -5904,15 +6015,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -5920,15 +6030,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -5936,15 +6045,14 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -5961,12 +6069,12 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -5974,79 +6082,75 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; 
GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: ; GFX12-GISEL: ; 
%bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6063,23 +6167,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6087,14 +6191,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6102,15 +6205,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6118,14 +6220,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6133,15 +6234,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: 
v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6155,23 +6255,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -6179,14 +6279,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 
0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6194,15 +6293,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6210,14 +6308,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; 
GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6225,15 +6322,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6248,43 +6344,43 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: 
v_permlane16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6292,15 +6388,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6308,15 +6404,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; 
GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6324,15 +6420,15 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: 
v_permlane16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6348,14 +6444,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6363,85 +6459,81 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 ; GFX10-GISEL-NEXT: 
global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6457,24 +6549,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6482,13 +6573,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6503,24 +6593,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6528,13 +6617,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6550,39 +6638,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, 
v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6590,15 +6677,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6606,15 +6692,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6622,15 +6707,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6647,12 +6731,12 @@ define amdgpu_kernel void 
@v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6660,79 +6744,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: 
v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6749,24 +6829,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm 
@@ -6774,13 +6853,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6795,24 +6873,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -6820,13 +6897,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlane16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -6842,39 +6918,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 
v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -6882,15 +6957,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; 
GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -6898,15 +6972,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, 
v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -6914,15 +6987,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -6939,12 +7011,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, 
s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -6952,79 +7024,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7041,24 +7109,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7066,13 +7133,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7087,24 +7153,23 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX10-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7112,13 +7177,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: 
; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7134,39 +7198,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX10-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7174,15 +7237,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX11-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7190,15 +7252,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7206,15 +7267,14 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; 
GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7231,12 +7291,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7244,79 +7304,75 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX10-GISEL: ; %bb.0: ; 
GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: 
v_permlane16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7333,24 +7389,23 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7358,13 +7413,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7378,24 +7432,23 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; 
GFX10-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7403,13 +7456,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_tid_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; 
GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7424,41 +7476,40 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7466,15 +7517,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], 
s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7482,15 +7532,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7498,15 +7547,14 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7522,12 +7570,12 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7535,79 +7583,75 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, 
s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: 
s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7623,24 +7667,23 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7648,13 +7691,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7669,24 +7711,23 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX10-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 
0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -7694,13 +7735,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-LABEL: v_permlanex16_b32_undef_tid_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -7716,39 +7756,38 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX10-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, 
s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7756,15 +7795,14 @@ define amdgpu_kernel void 
@v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7772,15 +7810,14 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 
+; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7788,15 +7825,14 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7813,12 +7849,12 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX10-SDAG-NEXT: 
v_permlanex16_b32 v1, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -7826,79 +7862,75 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; 
GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: 
v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -7915,23 +7947,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -7939,14 +7971,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -7954,15 +7985,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -7970,14 +8000,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -7985,15 +8014,14 @@ 
define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8007,23 +8035,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dword v2, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm @@ -8031,14 +8059,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8046,15 +8073,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX11-GISEL-NEXT: 
v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8062,14 +8088,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8077,15 +8102,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8100,43 +8124,43 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-SDAG: ; %bb.0: ; 
GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8144,15 +8168,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; 
GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8160,15 +8184,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8176,15 +8200,15 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; 
GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8200,14 +8224,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 ; GFX10-SDAG-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8215,85 +8239,81 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: 
s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x40934a00 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 ; GFX10-GISEL-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; 
GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, 
s0, s1 -; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-SDAG-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 -; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v0, s2, s3 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s2, s3 +; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8309,24 +8329,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8334,13 +8353,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: 
s_endpgm @@ -8355,24 +8373,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8380,13 +8397,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8402,39 +8418,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: 
v_permlanex16_b32_i_tid_fi_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8442,15 +8457,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8458,15 +8472,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8474,15 +8487,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8499,12 +8511,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8512,79 +8524,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,0] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8601,24 +8609,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8626,13 +8633,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8647,24 +8653,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX10-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8672,13 +8677,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-LABEL: v_permlanex16_b32_i_tid_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8694,39 +8698,38 @@ define amdgpu_kernel void 
@v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -8734,15 +8737,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -8750,15 +8752,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: 
v_permlanex16_b32_i_tid_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -8766,15 +8767,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8791,12 +8791,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -8804,79 +8804,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm 
; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; 
GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[0,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -8893,24 +8889,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8918,13 +8913,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8939,24 +8933,23 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX10-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -8964,13 +8957,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -8986,39 +8978,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX10-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -9026,15 +9017,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -9042,15 +9032,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -9058,15 +9047,14 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -9083,12 +9071,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-SDAG: ; 
%bb.0: ; GFX10-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-SDAG-NEXT: s_clause 0x1 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX10-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; @@ -9096,79 +9084,75 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x30 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX10-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_nop 0 ; 
GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 
op_sel:[1,1] -; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] -; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, s3 op_sel:[1,1] +; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index a65143255bbb4e..973678291e2632 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; 
GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -54,7 +54,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -72,7 +72,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -103,8 +103,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 
s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -116,9 +115,9 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -135,7 +134,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -148,7 +147,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -166,7 +165,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -179,7 +178,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -197,7 +196,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -210,7 +209,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -228,7 +227,7 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 
0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -241,7 +240,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -259,7 +258,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 @@ -272,7 +271,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -290,7 +289,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_mov_b32 v1, s2 @@ -303,7 +302,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -321,8 +320,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -334,9 +332,9 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -353,7 +351,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -366,7 +364,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -384,7 +382,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -397,7 +395,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -415,7 +413,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -428,7 +426,7 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -447,11 +445,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -462,10 +459,10 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3 ; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ 
-483,11 +480,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -498,10 +494,10 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -520,11 +516,11 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 
v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -535,12 +531,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -558,11 +552,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -573,10 +566,10 @@ define amdgpu_kernel void 
@v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -595,11 +588,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -610,10 +602,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -632,11 +624,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -647,10 +638,10 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out ; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -669,11 +660,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -684,10 +674,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i ; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -705,11 +695,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ 
-720,10 +709,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -742,11 +731,11 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] @@ -757,12 +746,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32 ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: 
s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -780,11 +767,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -795,10 +781,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -817,11 +803,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; 
GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -832,10 +817,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out, ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -854,11 +839,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-SDAG: ; %bb.0: ; GFX12-SDAG-NEXT: s_clause 0x1 -; GFX12-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] @@ -869,10 +853,10 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou ; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc: ; GFX12-GISEL: ; %bb.0: ; GFX12-GISEL-NEXT: s_clause 0x1 -; GFX12-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x30 -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index abb2f877781879..f653baa7365c71 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -5,29 +5,114 @@ declare i32 @llvm.amdgcn.permlane64(i32) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) { -; GFX11-LABEL: test_s: +define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { +; GFX11-LABEL: test_s_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { -; GFX11-LABEL: test_i: +define amdgpu_kernel void @test_s_f32(ptr addrspace(1) %out, float %src0) { +; GFX11-LABEL: test_s_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float %src0) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { +; GFX11-SDAG-LABEL: test_s_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { +; GFX11-SDAG-LABEL: test_s_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_s_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double %src0) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) 
{ +; GFX11-LABEL: test_i_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 @@ -36,16 +121,16 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane64(i32 99) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 99) store i32 %v, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { -; GFX11-LABEL: test_v: +define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { +; GFX11-LABEL: test_i_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -53,11 +138,314 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm + %v = call float @llvm.amdgcn.permlane64.f32(float 1234.5) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; 
GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 99) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: test_i_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_i_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11-GISEL-NEXT: s_endpgm + %v = call double @llvm.amdgcn.permlane64.f64(double 1234.5) + store double %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { +; GFX11-SDAG-LABEL: test_v_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx) + %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %tidx) store i32 %v, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} + +define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { +; GFX11-SDAG-LABEL: test_v_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %v = call float @llvm.amdgcn.permlane64.f32(float %tidx_f32) + store float %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { +; GFX11-LABEL: test_v_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_permlane64_b32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_permlane64_b32 v1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_i64 = zext i32 %tidx to i64 + %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %tidx_i64) + store i64 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { +; GFX11-SDAG-LABEL: 
test_v_f64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 +; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_v_f64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 +; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidx_f32 = bitcast i32 %tidx to float + %tidx_f64 = fpext float %tidx_f32 to double + %v = call double @llvm.amdgcn.permlane64.f64(double %tidx_f64) + store double %v, ptr addrspace(1) %out + ret void +} + +define void @test_half(ptr addrspace(1) %out, half %src0) { +; GFX11-LABEL: test_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call half @llvm.amdgcn.permlane64.f16(half %src0) + store half %v, ptr addrspace(1) %out + ret void +} + +define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { +; GFX11-LABEL: test_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) + store bfloat %v, ptr addrspace(1) %out + ret void +} + +define void @test_i16(ptr addrspace(1) %out, i16 %src0) { +; GFX11-LABEL: test_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) + store i16 %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { +; GFX11-LABEL: test_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) + store <2 x half> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { +; GFX11-SDAG-LABEL: test_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) + store <2 x float> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { +; GFX11-SDAG-LABEL: test_v7i32: +; GFX11-SDAG: ; %bb.0: 
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 +; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 +; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v7i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 +; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 +; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane64.v7i32(<7 x i32> %src0) + store <7 x i32> %v, ptr addrspace(1) %out + ret void +} + +define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { +; GFX11-SDAG-LABEL: test_v8i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 +; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 +; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_v8i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: 
v_permlane64_b32 v3, v3 +; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 +; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 +; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) + store <8 x i16> %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index afa3fe8c2f1fbd..2070a832e0fcd0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG-LABEL: test_p0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 @@ -24,13 +24,13 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x44 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x54 -; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 @@ -40,8 +40,8 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16 -; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[2:3] +; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -54,10 +54,10 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-LABEL: test_p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -73,8 +73,8 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-LABEL: test_v3p3: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -97,10 +97,10 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-LABEL: test_p5: ; 
GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -116,8 +116,8 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-LABEL: test_v3p5: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 @@ -140,10 +140,10 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-LABEL: test_p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] @@ -159,8 +159,8 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-LABEL: test_v3p6: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 
0x1 -; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 7e16358f741819..36d23197887136 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -4,7 +4,7 @@ ; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target ; GCN-LABEL: {{^}}test: -; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN: .amdhsa_user_sgpr_queue_ptr 1 define amdgpu_kernel void @test(ptr addrspace(1) %out) { %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 @@ -13,21 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out) { ret void } -; FIXME: Should really be able to delete the load -; GCN-LABEL: {{^}}test_ub: -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 -; GCN: .amdhsa_user_sgpr_queue_ptr 0 -define amdgpu_kernel void @test_ub(ptr addrspace(1) %out) #1 { - %queue_ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 - %value = load i32, ptr addrspace(4) %queue_ptr - store i32 %value, ptr addrspace(1) %out - ret void -} - declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-no-queue-ptr" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 419e19083f85e3..03f94d6e853f0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -5,8 +5,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -34,8 +33,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -62,8 +60,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -90,8 +87,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -119,8 +115,7 @@ bb2: define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_nonatomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc ; CHECK-NEXT: s_mov_b32 s0, 0 @@ -149,8 +144,8 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB5_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -179,8 +174,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -209,8 +203,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -243,8 +236,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; CHECK-LABEL: raw_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -273,8 +265,7 @@ bb2: define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; 
CHECK-LABEL: raw_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll index 9f0b420a0a828d..5d9daae69e7865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -21,11 +17,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sg ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -36,11 +28,7 
@@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffse ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -51,11 +39,7 @@ define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__ ; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) @@ -66,11 +50,7 @@ define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_ ; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; 
CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 6541ac9553231c..3228335073d077 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -5,8 +5,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -34,8 +33,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB1_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -62,8 +60,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB2_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -90,8 +87,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc: ; CHECK: ; 
%bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB3_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -119,8 +115,7 @@ bb2: define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc ; CHECK-NEXT: s_mov_b32 s0, 0 @@ -149,8 +144,8 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB5_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -179,8 +174,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB6_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -209,8 +203,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: 
.LBB7_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -243,8 +236,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB8_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -273,8 +265,7 @@ bb2: define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) { ; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: .LBB9_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 320b0b4508b6a5..9becefa33a8f24 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 @@ -26,7 +26,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) @@ -41,7 +41,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen offset:128 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 128 %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -56,7 +56,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll index ce46e2755ae582..9ac6b6a1d0ff9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll @@ -8,29 +8,21 @@ define void 
@raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -41,7 +33,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen scope:SCOPE_SYS ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, 
i32 24) ret void @@ -51,29 +43,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX908-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -84,7 +68,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void @@ -94,29 +78,21 @@ define void 
@raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -127,7 +103,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) 
ret void @@ -137,29 +113,21 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX908-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -170,7 +138,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> 
@llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -180,29 +148,21 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -213,7 +173,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen 
th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll index 327d80a7b67cdc..fc4449886d9541 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll @@ -7,18 +7,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc scc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 sc1 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -29,7 +25,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24) @@ -40,18 +36,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s18 glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, off, s[4:7], s8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -62,7 +54,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) @@ -73,18 +65,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; 
GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s18 offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], s8 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -95,7 +83,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -106,18 +94,14 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s18 offset:92 glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, off, s[4:7], s8 offset:92 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -128,7 +112,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0) @@ -139,18 +123,14 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -161,7 
+141,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 3ecbe3c71d0222..3c800d0369e70c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -9,11 +9,7 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21,33 +17,21 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_bf16: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -65,11 +49,7 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -78,33 +58,21 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; 
GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -122,11 +90,7 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 @@ -137,33 +101,21 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; 
GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -187,11 +139,7 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 @@ -206,33 +154,21 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) { ; GFX8-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s11, s17 -; GFX8-NEXT: s_mov_b32 s10, s16 -; GFX8-NEXT: s_mov_b32 s9, s7 -; GFX8-NEXT: s_mov_b32 s8, s6 -; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s11, s17 -; GFX9-NEXT: s_mov_b32 s10, s16 -; GFX9-NEXT: s_mov_b32 s9, s7 -; GFX9-NEXT: s_mov_b32 s8, s6 -; GFX9-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: raw_ptr_buffer_load_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index cc1547eaad8309..4d557c76dc4d07 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -1180,22 +1180,14 @@ define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %v ; PREGFX10-LABEL: buffer_load_f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1214,22 
+1206,14 @@ define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc ; PREGFX10-LABEL: buffer_load_v2f64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2f64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1248,22 +1232,14 @@ define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voff ; PREGFX10-LABEL: buffer_load_i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; 
GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1282,22 +1258,14 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i ; PREGFX10-LABEL: buffer_load_v2i64__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2i64__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1316,22 +1284,14 @@ define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffs ; PREGFX10-LABEL: buffer_load_p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, 
s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1350,22 +1310,14 @@ define <2 x ptr> @buffer_load_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, i3 ; PREGFX10-LABEL: buffer_load_v2p0__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p0__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1384,22 +1336,14 @@ define ptr addrspace(1) @buffer_load_p1__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_load_p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,22 +1362,14 @@ define <2 x ptr addrspace(1)> @buffer_load_v2p1__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p1__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p1__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1452,22 +1388,14 @@ define ptr addrspace(4) @buffer_load_p4__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: 
buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1486,22 +1414,14 @@ define <2 x ptr addrspace(4)> @buffer_load_v2p4__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p4__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p4__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1520,22 +1440,14 @@ define ptr addrspace(999) @buffer_load_p999__voffset_add(ptr addrspace(8) inreg ; PREGFX10-LABEL: buffer_load_p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: 
s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,22 +1466,14 @@ define <2 x ptr addrspace(999)> @buffer_load_v2p999__voffset_add(ptr addrspace(8 ; PREGFX10-LABEL: buffer_load_v2p999__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p999__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1588,22 +1492,14 @@ define ptr addrspace(2) @buffer_load_p2__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p2__voffset_add: ; PREGFX10: ; %bb.0: ; 
PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1622,22 +1518,14 @@ define <2 x ptr addrspace(2)> @buffer_load_v2p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1656,11 +1544,7 @@ define <3 x ptr addrspace(2)> 
@buffer_load_v3p2__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1679,22 +1563,14 @@ define <4 x ptr addrspace(2)> @buffer_load_v4p2__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p2__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p2__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,22 +1589,14 @@ define ptr addrspace(3) @buffer_load_p3__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword 
v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1747,22 +1615,14 @@ define <2 x ptr addrspace(3)> @buffer_load_v2p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1781,11 +1641,7 @@ define <3 x ptr addrspace(3)> @buffer_load_v3p3__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; 
GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1804,22 +1660,14 @@ define <4 x ptr addrspace(3)> @buffer_load_v4p3__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p3__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p3__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1838,22 +1686,14 @@ define ptr addrspace(5) @buffer_load_p5__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p5__voffset_add: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1872,22 +1712,14 @@ define <2 x ptr addrspace(5)> @buffer_load_v2p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1906,11 +1738,7 @@ define <3 x ptr addrspace(5)> @buffer_load_v3p5__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1929,22 +1757,14 @@ define <4 x ptr addrspace(5)> @buffer_load_v4p5__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p5__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p5__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1963,22 +1783,14 @@ define ptr addrspace(6) @buffer_load_p6__voffset_add(ptr addrspace(8) inreg %rsr ; PREGFX10-LABEL: buffer_load_p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dword v0, v0, 
s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1997,22 +1809,14 @@ define <2 x ptr addrspace(6)> @buffer_load_v2p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v2p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v2p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2031,11 +1835,7 @@ define <3 x ptr addrspace(6)> @buffer_load_v3p6__voffset_add(ptr addrspace(8) in ; GFX10-LABEL: buffer_load_v3p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2054,22 +1854,14 @@ define <4 x ptr addrspace(6)> @buffer_load_v4p6__voffset_add(ptr addrspace(8) in ; PREGFX10-LABEL: buffer_load_v4p6__voffset_add: ; PREGFX10: ; %bb.0: ; PREGFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; PREGFX10-NEXT: s_mov_b32 s11, s17 -; PREGFX10-NEXT: s_mov_b32 s10, s16 -; PREGFX10-NEXT: s_mov_b32 s9, s7 -; PREGFX10-NEXT: s_mov_b32 s8, s6 -; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_load_v4p6__voffset_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[8:11], 0 offen offset:60 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index d9227724c22a14..4fbb4ec342ff50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -352,22 +352,14 @@ define void @buffer_store_f64__voffset_add(ptr addrspace(8) inreg %rsrc, double ; VERDE-LABEL: buffer_store_f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; 
CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -379,22 +371,14 @@ define void @buffer_store_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2f64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2f64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -406,22 +390,14 @@ define void @buffer_store_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i64 %da ; VERDE-LABEL: buffer_store_i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_i64__voffset_add: ; 
CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -433,22 +409,14 @@ define void @buffer_store_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2i64__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2i64__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -460,22 +428,14 @@ define void @buffer_store_p0__voffset_add(ptr addrspace(8) inreg %rsrc, ptr %dat ; VERDE-LABEL: buffer_store_p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: 
buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -487,22 +447,14 @@ define void @buffer_store_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p0__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p0__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -514,22 +466,14 @@ define void @buffer_store_p1__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: 
s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -541,22 +485,14 @@ define void @buffer_store_v2p1__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p1__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p1__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -568,22 +504,14 @@ define void @buffer_store_p4__voffset_add(ptr addrspace(8) inreg %rsrc, ptr 
addr ; VERDE-LABEL: buffer_store_p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -595,22 +523,14 @@ define void @buffer_store_v2p4__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p4__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p4__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -622,22 +542,14 @@ define void @buffer_store_p999__voffset_add(ptr addrspace(8) inreg %rsrc, ptr ad ; VERDE-LABEL: buffer_store_p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -649,22 +561,14 @@ define void @buffer_store_v2p999__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x ; VERDE-LABEL: buffer_store_v2p999__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p999__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 
s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -676,22 +580,14 @@ define void @buffer_store_p2__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -703,22 +599,14 @@ define void @buffer_store_v2p2__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -730,22 +618,14 @@ define void @buffer_store_v3p2__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -757,22 +637,14 @@ define void @buffer_store_v4p2__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p2__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; 
VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p2__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -784,22 +656,14 @@ define void @buffer_store_p3__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -811,22 +675,14 @@ define void @buffer_store_v2p3__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; 
VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -838,22 +694,14 @@ define void @buffer_store_v3p3__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -865,22 +713,14 @@ define void @buffer_store_v4p3__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p3__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p3__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -892,22 +732,14 @@ define void @buffer_store_p5__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -919,22 +751,14 @@ define void 
@buffer_store_v2p5__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -946,22 +770,14 @@ define void @buffer_store_v3p5__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v3p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 
v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -973,22 +789,14 @@ define void @buffer_store_v4p5__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p5__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p5__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1000,22 +808,14 @@ define void @buffer_store_p6__voffset_add(ptr addrspace(8) inreg %rsrc, ptr addr ; VERDE-LABEL: buffer_store_p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: 
s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1027,22 +827,14 @@ define void @buffer_store_v2p6__voffset_add(ptr addrspace(8) inreg %rsrc, <2 x p ; VERDE-LABEL: buffer_store_v2p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v2p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1054,22 +846,14 @@ define void @buffer_store_v3p6__voffset_add(ptr addrspace(8) inreg %rsrc, <3 x p ; VERDE-LABEL: buffer_store_v3p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: 
buffer_store_v3p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx3 v[0:2], v3, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 @@ -1081,22 +865,14 @@ define void @buffer_store_v4p6__voffset_add(ptr addrspace(8) inreg %rsrc, <4 x p ; VERDE-LABEL: buffer_store_v4p6__voffset_add: ; VERDE: ; %bb.0: ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VERDE-NEXT: s_mov_b32 s11, s17 -; VERDE-NEXT: s_mov_b32 s10, s16 -; VERDE-NEXT: s_mov_b32 s9, s7 -; VERDE-NEXT: s_mov_b32 s8, s6 -; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: buffer_store_v4p6__voffset_add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[8:11], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[4:7], 0 offen offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index 30f04f1ff220cb..cb511c93f67ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, 
half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -17,28 +17,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -53,8 +53,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -65,28 +65,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -101,8 +101,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -115,32 +115,32 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; 
PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -158,8 +158,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -174,30 +174,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index a241bdeaff1a75..01df7634f0e9c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -10,8 +10,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] @@ -19,28 +19,28 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -51,8 +51,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -67,8 +67,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; 
PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff @@ -79,28 +79,28 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] @@ -111,8 +111,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x 
half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b32 s4, s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] @@ -127,8 +127,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 @@ -141,32 +141,32 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 
format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: s_and_b32 s0, s3, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -179,8 +179,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -193,8 +193,8 @@ define amdgpu_kernel void 
@tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -213,8 +213,8 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) ; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16 ; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff @@ -229,30 +229,30 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED] ; 
PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -264,8 +264,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index b061d53de5d3c5..cc6c630ae6466d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -161,7 +161,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -187,7 +187,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -204,7 +204,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out define amdgpu_kernel 
void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -215,7 +215,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -233,7 +233,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -265,7 +265,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -297,7 +297,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -311,7 +311,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -331,7 +331,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -345,7 +345,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; ; CHECK-GISEL-LABEL: 
test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -365,7 +365,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 +; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s9 ; CHECK-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-SDAG-NEXT: s_mov_b32 s4, 0 ; CHECK-SDAG-NEXT: ;;#ASMSTART @@ -375,7 +375,7 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { ; ; CHECK-GISEL-LABEL: test_readfirstlane_fi: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s15 +; CHECK-GISEL-NEXT: s_add_u32 s0, s0, s9 ; CHECK-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s4, 0 ; CHECK-GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 24a332fa211c15..66e1f9396de5af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 
0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s[0:1] @@ -64,7 +64,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] @@ -78,7 +78,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel 
void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x4 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -91,7 +91,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s0, s[6:7], 0x4 +; CHECK-GISEL-NEXT: s_load_dword s0, s[4:5], 0x4 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -110,7 +110,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -124,7 +124,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-SDAG-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -158,7 +158,7 @@ define amdgpu_kernel void 
@test_readlane_vreg_sreg_f64(double %src0, i32 %src1) ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[6:7], 0x8 +; CHECK-GISEL-NEXT: s_load_dword s1, s[4:5], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -178,7 +178,7 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -188,7 +188,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -203,7 +203,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -231,7 +231,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -260,7 +260,7 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -309,7 +309,7 @@ define 
amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -363,7 +363,7 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; CHECK-SDAG-LABEL: test_readlane_vregs_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -384,7 +384,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; ; CHECK-GISEL-LABEL: test_readlane_vregs_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -418,7 +418,7 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 
%src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -431,7 +431,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -450,7 +450,7 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -464,7 +464,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -484,7 +484,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -500,7 +500,7 @@ define amdgpu_kernel void 
@test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -522,7 +522,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -538,7 +538,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -560,7 +560,7 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -573,7 +573,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 
s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -592,7 +592,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -606,7 +606,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -626,7 +626,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND @@ -640,7 +640,7 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index e2f494283a3f2e..f52461b6b38075 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -18,21 +18,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -40,54 +40,54 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; 
GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; 
GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -96,11 +96,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -109,10 +109,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -121,10 +121,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; 
%bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -138,7 +138,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -148,7 +148,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -167,7 +167,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -176,7 +176,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -185,7 +185,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -194,7 +194,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -205,7 +205,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -216,7 +216,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -226,7 +226,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -280,7 +280,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -300,7 +300,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -320,7 +320,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -339,7 +339,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; 
GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -358,7 +358,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -396,7 +396,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 @@ -415,7 +415,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -434,17 +434,15 @@ define 
amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -458,16 +456,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 
.LBB3_1 @@ -482,16 +478,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -505,16 +500,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; 
GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -537,17 +530,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -562,8 +555,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -574,16 +567,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, 
s[0:1] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -596,8 +589,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -609,17 +602,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; 
GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -634,8 +627,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -645,16 +638,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -667,8 +660,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; 
GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -679,17 +672,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -704,8 +697,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 
v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -715,16 +708,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -737,8 +730,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -748,34 +741,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: 
s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_max_u32 s3, s3, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -784,54 
+777,52 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -847,8 +838,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -858,20 +849,18 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 
%in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -885,8 +874,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -897,38 +886,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, 0 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_max_u32 s3, s3, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -938,38 +925,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132GISEL-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 5304188e02f84a..bfdb2da6dc6a41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -19,21 +19,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8DAGISEL-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8DAGISEL-NEXT: s_endpgm ; ; GFX8GISEL-LABEL: uniform_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -41,54 +41,54 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; ; GFX9DAGISEL-LABEL: uniform_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9DAGISEL-NEXT: s_endpgm ; ; GFX9GISEL-LABEL: uniform_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9GISEL-NEXT: s_endpgm ; ; GFX10DAGISEL-LABEL: uniform_value: ; GFX10DAGISEL: ; %bb.0: ; %entry ; GFX10DAGISEL-NEXT: s_clause 0x1 -; GFX10DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10DAGISEL-NEXT: s_endpgm ; ; GFX10GISEL-LABEL: uniform_value: ; GFX10GISEL: ; %bb.0: ; %entry ; GFX10GISEL-NEXT: s_clause 0x1 -; GFX10GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: uniform_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry ; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1164DAGISEL-NEXT: s_nop 0 ; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -97,11 +97,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-LABEL: uniform_value: ; GFX1164GISEL: ; %bb.0: ; %entry ; GFX1164GISEL-NEXT: s_clause 0x1 -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1164GISEL-NEXT: s_nop 0 ; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -110,10 +110,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-LABEL: uniform_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_clause 0x1 -; 
GFX1132DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1132DAGISEL-NEXT: s_nop 0 ; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -122,10 +122,10 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-LABEL: uniform_value: ; GFX1132GISEL: ; %bb.0: ; %entry ; GFX1132GISEL-NEXT: s_clause 0x1 -; GFX1132GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: const_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -149,7 +149,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: const_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b ; GFX8GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -159,7 +159,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: const_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -168,7 +168,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: const_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -177,7 +177,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10DAGISEL-LABEL: const_value: ; GFX10DAGISEL: ; %bb.0: ; %entry -; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX10GISEL-LABEL: const_value: ; GFX10GISEL: ; %bb.0: ; %entry -; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -195,7 +195,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: const_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX1164DAGISEL-NEXT: s_waitcnt 
lgkmcnt(0) @@ -206,7 +206,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: const_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +217,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: const_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -227,7 +227,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: const_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] @@ -281,7 +281,7 @@ entry: define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -301,7 +301,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX8GISEL-LABEL: divergent_value: ; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], 
exec ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9DAGISEL-LABEL: divergent_value: ; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -340,7 +340,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX9GISEL-LABEL: divergent_value: ; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -359,7 +359,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064DAGISEL-LABEL: divergent_value: ; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 @@ -378,7 +378,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1064GISEL-LABEL: divergent_value: ; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -397,7 +397,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032DAGISEL-LABEL: divergent_value: ; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1 @@ -416,7 +416,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1032GISEL-LABEL: divergent_value: ; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 @@ -435,17 +435,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164DAGISEL-LABEL: divergent_value: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -459,16 +457,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1164GISEL-LABEL: divergent_value: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], 
exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -483,16 +479,15 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132DAGISEL-LABEL: divergent_value: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -506,16 +501,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; ; GFX1132GISEL-LABEL: divergent_value: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 @@ -538,17 +531,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -563,8 +556,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; 
GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -575,16 +568,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL: ; %bb.0: ; %entry ; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s4 ; GFX8GISEL-NEXT: .LBB4_2: ; %Flow -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -597,8 +590,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8GISEL-NEXT: .LBB4_5: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -610,17 +603,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL: ; %bb.0: ; %entry ; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX9DAGISEL-NEXT: ; implicit-def: 
$sgpr4 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX9DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -635,8 +628,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -646,16 +639,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL: ; %bb.0: ; %entry ; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; 
GFX9GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s4 ; GFX9GISEL-NEXT: .LBB4_2: ; %Flow -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -668,8 +661,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9GISEL-NEXT: .LBB4_5: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -680,17 +673,17 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL: ; %bb.0: ; %entry ; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; 
GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -705,8 +698,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -716,16 +709,16 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL: ; %bb.0: ; %entry ; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX1064GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -738,8 +731,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064GISEL-NEXT: .LBB4_5: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, 
exec, s[0:1] -; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -749,34 +742,34 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-LABEL: divergent_cfg: ; GFX1032DAGISEL: ; %bb.0: ; %entry ; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dword s3, s[0:1], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, -1 ; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 +; GFX1032DAGISEL-NEXT: s_min_u32 s3, s3, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 
s4, 0 ; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] @@ -785,54 +778,52 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-LABEL: divergent_cfg: ; GFX1032GISEL: ; %bb.0: ; %entry ; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX1032GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1032GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s3, s3 ; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1032GISEL-NEXT: s_mov_b32 s2, -1 ; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s5, s4 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s5 -; 
GFX1032GISEL-NEXT: s_min_u32 s0, s0, s6 +; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032GISEL-NEXT: .LBB4_5: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1032GISEL-NEXT: s_endpgm ; ; GFX1164DAGISEL-LABEL: divergent_cfg: ; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr4 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] ; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec @@ -848,8 +839,8 @@ define amdgpu_kernel void 
@divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -859,20 +850,18 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1164GISEL-LABEL: divergent_cfg: ; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s4 ; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] ; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -886,8 +875,8 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164GISEL-NEXT: .LBB4_5: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -898,38 +887,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132DAGISEL-LABEL: divergent_cfg: ; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b32 s3, s[0:1], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 ; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, -1 ; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s4, s5 -; 
GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 +; GFX1132DAGISEL-NEXT: s_min_u32 s3, s3, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -939,38 +926,36 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; ; GFX1132GISEL-LABEL: divergent_cfg: ; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr0 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 +; GFX1132GISEL-NEXT: s_mov_b32 s2, s2 ; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s3, s3 ; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 +; GFX1132GISEL-NEXT: s_mov_b32 s2, -1 ; 
GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s5, s4 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1132GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6 +; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132GISEL-NEXT: .LBB4_5: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1132GISEL-NEXT: s_nop 0 ; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index d521a6c25e462e..47c021769aa56f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -10,104 +10,102 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT0-LABEL: test_barrier: ; VARIANT0: ; %bb.0: ; %entry -; VARIANT0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT0-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT0-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT0-NEXT: s_mov_b32 s2, 0 +; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: 
v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 -; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) -; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_endpgm ; ; VARIANT1-LABEL: test_barrier: ; VARIANT1: ; %bb.0: ; %entry -; VARIANT1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; VARIANT1-NEXT: s_load_dword s4, s[2:3], 0xb -; VARIANT1-NEXT: s_mov_b32 s3, 0xf000 -; VARIANT1-NEXT: s_mov_b32 s2, 0 +; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb +; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 +; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) -; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_waitcnt vmcnt(0) -; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; VARIANT1-NEXT: buffer_store_dword 
v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_endpgm ; ; VARIANT2-LABEL: test_barrier: ; VARIANT2: ; %bb.0: ; %entry -; VARIANT2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT2-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT2-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry -; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VARIANT3-NEXT: s_load_dword s4, s[2:3], 0x2c +; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VARIANT3-NEXT: v_mov_b32_e32 v3, s1 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; 
VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v2, v0, s[0:1] +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: s_endpgm ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT4-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 -; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT4-NEXT: s_nop 0 @@ -116,22 +114,20 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; VARIANT5-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; VARIANT5-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v2 +; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v0, v2, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v2, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 +; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 -; VARIANT5-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 ; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] ; VARIANT5-NEXT: s_nop 0 @@ -140,24 +136,23 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 -; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 ; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) -; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 -; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] -; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: 
v_dual_mov_b32 v3, s0 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 +; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 +; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] +; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo -; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off +; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] ; VARIANT6-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 8bfe996c6a90a3..38a34ec6daf73c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -23,11 +22,10 @@ define amdgpu_kernel 
void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -53,11 +51,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -71,11 +68,10 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -101,11 +97,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -119,11 +114,10 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -149,12 +143,12 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_var: ; GCN: ; 
%bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_mov_b32 m0, 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] @@ -168,13 +162,11 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -230,10 +222,8 @@ define void @test2_s_barrier_signal_var(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: 
s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -252,10 +242,8 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -290,10 +278,8 @@ entry: define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -312,10 +298,8 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 
s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -350,10 +334,8 @@ entry: define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_signal_isfirst: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -372,10 +354,8 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -410,11 +390,9 @@ entry: define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_signal_isfirst_var: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_wait_storecnt 0x0 @@ -433,11 +411,9 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 @@ -542,11 +518,10 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -560,11 +535,10 @@ define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], 
s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -588,11 +562,10 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -606,11 +579,10 @@ define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: 
v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -634,11 +606,10 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 @@ -652,11 +623,10 @@ define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrC ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 @@ -680,17 +650,15 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] ; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_barrier_init m0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -700,11 +668,10 @@ define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 @@ -765,12 +732,11 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: 
s_barrier_join -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -780,11 +746,10 @@ define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -807,12 +772,11 @@ entry: define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -822,11 +786,10 @@ define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -849,12 +812,11 @@ entry: define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_barrier_join: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_barrier_join 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -864,11 +826,10 @@ define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_join: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -891,11 +852,11 @@ entry: define 
amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -908,11 +869,10 @@ define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %b ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -964,10 +924,8 @@ define void @test5_s_barrier_join_m0(i32 %arg) { define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_barrier_leave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] ; GCN-NEXT: s_barrier_leave @@ -985,16 +943,14 @@ define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrsp ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] ; GLOBAL-ISEL-NEXT: s_barrier_leave ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 ; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GLOBAL-ISEL-NEXT: s_clause 0x1 @@ -1022,12 +978,11 @@ entry: define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier -1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1037,11 +992,10 @@ define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; 
GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1064,12 +1018,11 @@ entry: define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1079,11 +1032,10 @@ define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: 
v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1106,12 +1058,11 @@ entry: define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_wakeup_barrier: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GCN-NEXT: s_wakeup_barrier 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v2, v0, s[0:1] @@ -1121,11 +1072,10 @@ define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1148,11 +1098,11 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 @@ -1165,11 +1115,10 @@ define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 @@ -1221,12 +1170,11 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test1_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, -1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1234,14 +1182,13 @@ define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1259,12 +1206,11 @@ entry: define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test2_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 1 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1272,14 +1218,13 @@ define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; 
GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1297,12 +1242,11 @@ entry: define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test3_s_get_barrier_state: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_get_barrier_state s2, 0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1310,14 +1254,13 @@ define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] ; GLOBAL-ISEL-NEXT: s_nop 0 @@ -1335,10 +1278,8 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1353,10 +1294,8 @@ define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1413,11 +1352,10 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-LABEL: test_barrier_convert: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] @@ -1431,11 +1369,10 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; ; GLOBAL-ISEL-LABEL: test_barrier_convert: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll index bc7052132a87b0..4a404af54188d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll @@ -37,7 +37,7 @@ define void @test_s_sleep_var2() { define amdgpu_kernel void @test_s_sleep_var3(i32 %arg) { ; GCN-LABEL: test_s_sleep_var3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_sleep_var s0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 527627a5a2f67d..c2e74eb05d1645 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,11 +5,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -73,11 +72,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v40, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 @@ -177,11 +175,10 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias 
%in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 @@ -259,11 +256,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x7fe0, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index a29e2298210a3a..fdcb1773d0a3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,12 +7,11 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr 
addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 @@ -59,12 +58,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_and_b32_e32 v28, 0x3ff0, v0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 @@ -149,131 +147,127 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; 
GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; GCN-NEXT: ds_load_b128 v[0:3], v17 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[1:4], v17 +; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: ds_store_b128 v0, v[13:16] +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; 
sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; GCN-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; GCN-NEXT: ds_load_b128 
v[9:12], v17 offset:10240 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; GCN-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v16, 5, s0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v16, v16, 4, s1 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, 
v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:512 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:4608 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; 
EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1024 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:7168 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:1536 -; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v17 offset:10240 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) 
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v9 :: v_dual_mov_b32 v12, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[8:11], v[0:7], v18 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2048 +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 24b8a3c2dc8730..10f09b6390abae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,8 +29,7 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -97,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -180,27 +178,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 +; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 +; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 -; GCN-NEXT: v_mul_lo_u32 v30, v30, 
v30 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 +; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -208,33 +213,24 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 +; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_lo_u32 v19, 
v19, v19 ; GCN-NEXT: v_mul_lo_u32 v18, v18, v18 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v17 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v23, v23, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -245,11 +241,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v22, v22, v22 ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 +; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 ; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 ; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 ; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -261,27 +258,34 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: 
global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -289,33 +293,24 
@@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(2) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17 -; 
EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) @@ -326,11 +321,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] @@ -385,23 +381,23 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -409,7 +405,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -423,17 +418,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 -; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: 
global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 +; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -442,7 +430,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -450,47 +439,53 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; 
GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: 
s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 +; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -498,7 +493,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 @@ -512,17 +506,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, 
v[0:3], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 -; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 +; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -531,7 +518,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 ; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) @@ -539,25 +527,31 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) 
size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64 +; 
EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -620,9 +614,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v1, s0, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -727,9 +720,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 @@ -870,9 +862,8 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; 
GCN-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1004,9 +995,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v1, 0x1ff80, v0 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,9 +1188,9 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out, <5 x float> %in1) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GCN-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1212,8 +1202,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_add_f32_e32 v4, v6, v4 ; GCN-NEXT: v_exp_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1288,7 +1277,7 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_mul_f32_e32 v4, s7, v3 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GCN-NEXT: v_rndne_f32_e32 v10, v4 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x54 +; GCN-NEXT: s_load_dword s8, s[0:1], 0x54 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; GCN-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1324,7 +1313,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1335,8 +1324,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: ; kill: killed $sgpr2_sgpr3 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) @@ -1383,9 +1372,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_interleave_EXP_MFMA: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x44 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 0x3fb8aa3b -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v7, 0x32a5705f ; 
EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,8 +1386,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_add_f32_e32 v4, v6, v4 ; EXACTCUTOFF-NEXT: v_exp_f32_e32 v4, v4 ; EXACTCUTOFF-NEXT: v_cvt_i32_f32_e32 v5, v5 -; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x1ff80, v0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -1473,7 +1461,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_mul_f32_e32 v4, s7, v3 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; EXACTCUTOFF-NEXT: v_rndne_f32_e32 v10, v4 -; EXACTCUTOFF-NEXT: s_load_dword s8, s[2:3], 0x54 +; EXACTCUTOFF-NEXT: s_load_dword s8, s[0:1], 0x54 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v9, v1, a[64:95] ; EXACTCUTOFF-NEXT: v_sub_f32_e32 v1, v4, v10 @@ -1509,7 +1497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; EXACTCUTOFF-NEXT: v_cmp_ngt_f32_e32 vcc, s8, v6 ; EXACTCUTOFF-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v9, v1, a[128:159] @@ -1520,8 +1508,8 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_interleave_EXP_MFMA ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 -; EXACTCUTOFF-NEXT: ; kill: killed 
$sgpr2_sgpr3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000400) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index 363c54d4abe908..eb30484ea7f19e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_doorbell: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -16,7 +16,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_doorbell: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -32,7 +32,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_ddid: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -43,7 +43,7 @@ define 
amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_ddid: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tma: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_realtime: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -93,7 +93,7 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_savewave: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_savewave: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_tba: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_get_0_i32: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -148,7 +148,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX11-GISEL-LABEL: test_get_0_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { ; GFX11-LABEL: test_get_99999_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 114d2d099ab7b1..fc33206845a713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -24,7 +24,7 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 1 @@ -39,7 +39,7 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -61,7 +61,7 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-LABEL: set_inactive_imm_poison_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -77,17 +77,17 @@ define amdgpu_kernel void 
@set_inactive_imm_poison_64(ptr addrspace(1) %out) { define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) { ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 56 +; GCN-NEXT: s_cmp_lg_u32 s3, 56 ; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc1 .LBB4_3 ; GCN-NEXT: ; %bb.1: ; %Flow @@ -127,8 +127,8 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x40400000 @@ -147,7 +147,7 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -171,8 +171,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, 
double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x10001 @@ -191,8 +191,8 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3c003c00 @@ -211,7 +211,7 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -235,7 +235,7 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -259,8 +259,8 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; 
GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s5, 0x3f803f80 @@ -279,7 +279,7 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -303,7 +303,7 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -327,7 +327,7 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -351,7 +351,7 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: 
s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -373,8 +373,8 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -392,8 +392,8 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -411,8 +411,8 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,8 +430,8 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll index c1f1782ea5a87f..5401de0b082883 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -5,11 +5,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 @@ -22,11 +18,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset_ ; CHECK-LABEL: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -37,11 +29,7 @@ define void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffs ; CHECK-LABEL: 
struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -52,11 +40,7 @@ define void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vof ; CHECK-LABEL: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s11, s17 -; CHECK-NEXT: s_mov_b32 s10, s16 -; CHECK-NEXT: s_mov_b32 s9, s7 -; CHECK-NEXT: s_mov_b32 s8, s6 -; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen offset:24 +; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 78204dfefc80cc..e0e4f950cc16c2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -10,7 +10,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen 
th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -25,7 +25,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index 10059960030446..864244b6cebcf9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -8,11 +8,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -21,11 +17,7 @@ define void 
@struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -34,7 +26,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -45,7 +37,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -56,29 +48,21 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_add_f32 v0, 
v1, s[8:11], s18 idxen +; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -89,7 +73,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -99,11 +83,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: 
buffer_atomic_add_f32 v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX908-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -112,11 +92,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,7 +101,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -136,7 +112,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -146,11 +122,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX908-LABEL: 
struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s11, s17 -; GFX908-NEXT: s_mov_b32 s10, s16 -; GFX908-NEXT: s_mov_b32 s9, s7 -; GFX908-NEXT: s_mov_b32 s8, s6 -; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s18 idxen offen +; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[4:7], s8 idxen offen ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; @@ -159,11 +131,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -172,7 +140,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -183,7 +151,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> 
@llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index 5f6a67e4660209..ba6005e004efc4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -9,11 +9,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -22,7 +18,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -33,7 +29,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: 
s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -45,18 +41,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen glc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], s8 idxen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen sc0 +; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -67,7 +59,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -80,11 +72,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: 
s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc slc +; GFX90A-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[4:7], s8 idxen offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -93,7 +81,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 nt +; GFX940-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s4 idxen offen sc0 nt ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -104,7 +92,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -117,11 +105,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s9, s7 -; GFX90A-NEXT: s_mov_b32 s8, s6 -; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen glc +; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[4:7], s8 idxen offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -130,7 +114,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen sc0 +; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s4 idxen offen sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; @@ -141,7 +125,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__ ; GFX1200-NEXT: s_wait_samplecnt 0x0 ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX1200-NEXT: s_wait_loadcnt 0x0 ; GFX1200-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index bd803c380e90a5..1fb5d53d5fd826 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: 
buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax v0, 
v[1:2], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: 
s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], s8 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s4 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float 
@llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmax v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_max_f32 
v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s4 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -449,14 +353,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -473,14 +377,14 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX7-NEXT: v_readfirstlane_b32 s10, 
v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -490,25 +394,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX10-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_and_saveexec_b32 s5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmax v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX10-NEXT: ; 
implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll index 4f9bac584a78e4..b859147b6dc6b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double 
@llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 
s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) 
@@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: 
struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmax__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 +138,14 @@ define void @struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: 
struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmax_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -228,14 +164,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: 
$vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -252,14 +188,14 @@ define double @struct_ptr_buffer_atomic_fmax_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index c9b50eddc94eef..87055db9a58f09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -10,40 +10,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: 
s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -54,7 +42,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -65,40 +53,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen 
offset:256 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -109,7 +85,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -121,40 +97,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; 
GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen glc +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen glc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +129,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -176,40 +140,28 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; 
GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen glc slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen glc slc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen glc slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen glc slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -220,7 +172,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN 
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -231,39 +183,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: @@ -273,7 +213,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -283,39 +223,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], 
s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen offset:256 +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen offset:256 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: @@ -325,7 +253,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen offset:256 ; GFX12-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) @@ -337,39 +265,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[8:11], s18 idxen +; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], s8 idxen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s6 idxen +; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s4 idxen ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: @@ -379,7 +295,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 idxen ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void @@ -389,39 +305,27 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s11, s17 -; GFX10-NEXT: s_mov_b32 s10, s16 -; GFX10-NEXT: s_mov_b32 s9, s7 -; GFX10-NEXT: s_mov_b32 s8, s6 -; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[8:11], s18 idxen offen slc +; GFX10-NEXT: buffer_atomic_fmin v0, v[1:2], s[4:7], s8 idxen offen slc ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s6 idxen offen slc +; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s4 idxen offen slc ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: @@ -431,7 +335,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_NT ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void @@ -449,14 +353,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX6-NEXT: v_readfirstlane_b32 s10, v3 ; GFX6-NEXT: v_readfirstlane_b32 s11, v4 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX6-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz 
.LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -473,14 +377,14 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX7-NEXT: v_readfirstlane_b32 s10, v3 ; GFX7-NEXT: v_readfirstlane_b32 s11, v4 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[3:4] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[3:4] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX7-NEXT: ; implicit-def: $vgpr5_vgpr6 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] @@ -490,25 +394,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX10-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, exec_lo +; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_readfirstlane_b32 s8, v1 ; GFX10-NEXT: v_readfirstlane_b32 s9, v2 ; GFX10-NEXT: v_readfirstlane_b32 s10, v3 ; GFX10-NEXT: v_readfirstlane_b32 s11, v4 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[1:2] -; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[3:4] -; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_and_saveexec_b32 s4, s4 +; GFX10-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[3:4] +; GFX10-NEXT: s_and_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_and_saveexec_b32 s5, s5 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s6 idxen offen offset:256 glc +; GFX10-NEXT: buffer_atomic_fmin v0, v[5:6], s[8:11], s4 idxen offen offset:256 glc ; GFX10-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX10-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll index 01bc833d59be79..5c23a86dab33ab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64.ll @@ -9,22 +9,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen 
offen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,22 +27,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -62,22 +46,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 
v[0:1], v2, s[8:11], s18 idxen glc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -88,22 +64,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen glc slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen glc slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -114,22 +82,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -140,22 +100,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX6-NEXT: 
buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen offset:256 +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen offset:256 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %voffset.add = add i32 %voffset, 256 @@ -168,22 +120,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_vof ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], s18 idxen +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], s8 idxen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) @@ -194,22 
+138,14 @@ define void @struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_ ; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s11, s17 -; GFX6-NEXT: s_mov_b32 s10, s16 -; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: s_mov_b32 s8, s6 -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: struct_ptr_buffer_atomic_fmin_f64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s11, s17 -; GFX7-NEXT: s_mov_b32 s10, s16 -; GFX7-NEXT: s_mov_b32 s9, s7 -; GFX7-NEXT: s_mov_b32 s8, s6 -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[8:11], s18 idxen offen slc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[2:3], s[4:7], s8 idxen offen slc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) @@ -228,14 +164,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX6-NEXT: v_readfirstlane_b32 s10, v4 ; GFX6-NEXT: v_readfirstlane_b32 s11, v5 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX6-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX6-NEXT: 
buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX6-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[12:13] @@ -252,14 +188,14 @@ define double @struct_ptr_buffer_atomic_fmin_f64_ret__vgpr_val__vgpr_rsrc__vgpr_ ; GFX7-NEXT: v_readfirstlane_b32 s10, v4 ; GFX7-NEXT: v_readfirstlane_b32 s11, v5 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3] -; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5] -; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], v[4:5] +; GFX7-NEXT: s_and_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s6 idxen offen offset:256 glc +; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v[6:7], s[8:11], s4 idxen offen offset:256 glc ; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[12:13] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 38fdcf47171aff..439742d6b315d3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -8,40 +8,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: 
tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -57,43 +57,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; 
PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -109,29 +109,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; 
%bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -153,9 +153,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -174,30 +174,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 
format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -205,12 +205,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -218,9 +218,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 1da076c6523990..22ec22dc2db024 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -11,40 +11,40 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_x: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_x: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -56,8 +56,8 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX12-PACKED-LABEL: tbuffer_store_d16_x: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -73,43 +73,43 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 -; 
PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s4, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s4 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm ; ; GFX10-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x1 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; ; GFX11-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: 
s_clause 0x1 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -121,8 +121,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat ; GFX12-PACKED-LABEL: tbuffer_store_d16_xy: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -138,29 +138,29 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7 ; 
PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -168,13 +168,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: s_and_b32 s5, s5, 0xffff -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX10-PACKED-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -182,9 +182,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 @@ -198,8 +198,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body ; GFX12-PACKED-SDAG-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 +; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff ; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -213,8 +213,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da ; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz: ; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body ; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 
0x10 +; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4 ; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -233,30 +233,30 @@ main_body: define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { ; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-UNPACKED: ; %bb.0: ; %main_body -; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; PREGFX10-UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[4:5], 0x18 ; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff -; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 -; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s7, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s7, s7, 0xffff +; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s8, s6, 16 +; PREGFX10-UNPACKED-NEXT: s_and_b32 s6, s6, 0xffff +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s6 ; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s8 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s7 -; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s7 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s5 +; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v4, s4 ; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-UNPACKED-NEXT: s_endpgm ; ; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; PREGFX10-PACKED: ; %bb.0: ; %main_body -; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; 
PREGFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PREGFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; PREGFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; PREGFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; PREGFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; PREGFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; PREGFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED-NEXT: s_endpgm @@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX10-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX10-PACKED: ; %bb.0: ; %main_body ; GFX10-PACKED-NEXT: s_clause 0x2 -; GFX10-PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX10-PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-PACKED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-PACKED-NEXT: s_load_dword s8, s[4:5], 0x18 +; GFX10-PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-PACKED-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-PACKED-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-PACKED-NEXT: v_mov_b32_e32 v2, s8 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm @@ -277,9 +277,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX11-PACKED: ; %bb.0: ; %main_body ; GFX11-PACKED-NEXT: s_clause 0x2 -; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[2:3], 0x10 -; GFX11-PACKED-NEXT: s_load_b32 s6, s[2:3], 0x18 -; 
GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[0:1], 0x18 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 @@ -292,8 +292,8 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d ; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw: ; GFX12-PACKED: ; %bb.0: ; %main_body ; GFX12-PACKED-NEXT: s_clause 0x1 -; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[2:3], 0x10 -; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX12-PACKED-NEXT: s_load_b96 s[4:6], s[0:1], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 279a64adfbda15..0755dcddd8f46e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr 
addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -50,7 +50,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -69,7 +69,7 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b @@ -83,7 +83,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -102,7 +102,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s8, 0x7b @@ -117,7 +117,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -137,7 +137,7 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_reg_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_reg_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_0_width_imm_offset: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -172,7 +172,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, ; ; VI-LABEL: bfe_u32_arg_0_width_imm_offset: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +187,7 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: 
bfe_u32_zextload_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -204,7 +204,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: bfe_u32_zextload_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -229,7 +229,7 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -248,7 +248,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: bfe_u32_zext_in_reg_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -275,7 +275,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -294,7 +294,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: bfe_u32_zext_in_reg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -341,7 +341,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -369,7 +369,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -389,7 +389,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out ; ; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -465,7 +465,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -484,7 +484,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou ; ; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -511,7 +511,7 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -529,7 +529,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: 
s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -553,7 +553,7 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -563,7 +563,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -580,7 +580,7 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -590,7 +590,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -607,7 +607,7 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: 
s_mov_b32 s2, -1 @@ -617,7 +617,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -635,7 +635,7 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -653,7 +653,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -679,7 +679,7 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -698,7 +698,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -724,7 +724,7 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_7(ptr 
addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -742,7 +742,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -767,7 +767,7 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -785,7 +785,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -810,7 +810,7 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -828,7 +828,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: bfe_u32_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -852,7 +852,7 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -870,7 +870,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -894,7 +894,7 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -912,7 +912,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -936,7 +936,7 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: 
s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -954,7 +954,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -979,7 +979,7 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -997,7 +997,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1021,7 +1021,7 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: bfe_u32_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1031,7 +1031,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: bfe_u32_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace( 
define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1057,7 +1057,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # ; ; VI-LABEL: 
bfe_u32_constant_fold_test_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1122,7 +1122,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_3: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1132,7 +1132,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_3: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, -1 @@ -1157,7 +1157,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, -1 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 { ; SI-LABEL: 
bfe_u32_constant_fold_test_5: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_5: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1197,7 +1197,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1207,7 +1207,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x80 @@ -1222,7 +1222,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_7: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1232,7 +1232,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_7: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1247,7 +1247,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1272,7 +1272,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_9: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # ; ; VI-LABEL: bfe_u32_constant_fold_test_9: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) # define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_10: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_10: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_11: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1332,7 +1332,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_11: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_12: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,7 +1357,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_12: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: 
s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1372,7 +1372,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_13: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 @@ -1382,7 +1382,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_13: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_14: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 40 @@ -1407,7 +1407,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_14: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 40 @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_15: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 
s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 10 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_15: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 10 @@ -1447,7 +1447,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1472,7 +1472,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_17: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_17: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f @@ -1497,7 +1497,7 @@ define 
amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 { ; SI-LABEL: bfe_u32_constant_fold_test_18: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) ; ; VI-LABEL: bfe_u32_constant_fold_test_18: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1526,45 +1526,47 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 63, v0 ; SI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 { @@ -1579,11 +1581,11 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0 define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 @@ -1591,8 +1593,8 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1609,7 +1611,7 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, s3 @@ -1623,7 +1625,7 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,11 +1645,11 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1655,8 +1657,8 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; 
VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1673,11 +1675,11 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: and_lshr2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x30006 +; SI-NEXT: s_bfe_u32 s4, s2, 0x30006 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1685,8 +1687,8 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { ; ; VI-LABEL: and_lshr2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1703,11 +1705,11 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 { define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; SI-LABEL: shl_lshr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s4, 0x150002 +; SI-NEXT: s_bfe_u32 s4, s2, 0x150002 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1715,8 +1717,8 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 { ; ; 
VI-LABEL: shl_lshr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index abce1f6cd8f84a..ab29ca4a997348 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -1,8 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o %t.bc -; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %t.bc | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.bc | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,UNKNOWN-OS %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D %s declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index eaee8ec73fe411..47f988fc17d281 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -1,10 +1,9 @@ -; RUN: opt -mtriple=amdgcn-- 
-passes=amdgpu-attributor -o %t.bc %s -; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global < %t.bc | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.bc | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 9d93ca65683c42..31f1085dd76ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -14,7 +14,7 @@ declare double 
@llvm.amdgcn.writelane.f64(double, i32, double) #0 define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -54,7 +54,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -68,7 +68,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1010-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 @@ -100,8 +100,8 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -118,24 +118,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: 
v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -151,8 +151,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; ; GFX802-GISEL-LABEL: test_writelane_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -169,24 +169,24 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX1010-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, 
s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -208,8 +208,8 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -226,24 +226,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s8 -; GFX1010-SDAG-NEXT: 
v_writelane_b32 v0, s2, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -259,8 +259,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s6, s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -277,24 +277,24 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX1010-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, 
s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -316,8 +316,8 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -331,8 +331,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -345,23 +345,23 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-SDAG: 
; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s2 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -375,8 +375,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -389,15 +389,15 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; 
GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s2 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -410,8 +410,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -427,41 +427,41 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, 
s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, 0, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 32, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX1100-SDAG-NEXT: v_writelane_b32 v1, 0, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 32, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -477,33 +477,33 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 32, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, 0, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 32, s4 ; GFX1100-GISEL-NEXT: v_writelane_b32 v1, 0, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; 
GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -516,8 +516,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -535,8 +535,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -544,35 +544,35 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-SDAG-NEXT: s_mov_b32 s2, 0x40400000 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s2, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, 0, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; 
GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-SDAG-NEXT: s_mov_b32 s0, 0x40400000 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s0, s4 ; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s4, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -590,8 +590,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -599,26 +599,26 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX1010-GISEL-NEXT: s_mov_b32 s2, 0x40400000 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, 0, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s2, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_sreg_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1100-GISEL-NEXT: s_mov_b32 s2, 0x40400000 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX1100-GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GFX1100-GISEL-NEXT: v_writelane_b32 v0, 0, s4 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s2, s4 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s0, s4 +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -631,7 +631,7 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 define amdgpu_kernel void 
@test_writelane_vreg_lane_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -654,7 +654,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -671,9 +671,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -692,7 +690,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -716,7 +714,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1010-GISEL: ; %bb.0: 
-; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dword v0, v0, s[2:3] offset:4 @@ -733,9 +731,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 @@ -764,7 +760,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -789,7 +785,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -808,11 +804,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -831,7 +825,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -856,7 +850,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -874,9 +868,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -907,7 +899,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) 
%out, p define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { ; GFX802-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -934,7 +926,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -954,11 +946,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-SDAG-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_b32 v0, v0, s[2:3] offset:8 ; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -978,7 +968,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX802-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,7 +995,7 @@ 
define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:8 @@ -1024,9 +1014,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; ; GFX1100-GISEL-LABEL: test_writelane_vreg_lane_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:8 @@ -1059,8 +1047,8 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND @@ -1079,8 +1067,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, 
s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: ;;#ASMSTART ; GFX1010-SDAG-NEXT: s_mov_b32 m0, -1 @@ -1096,26 +1084,26 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-SDAG-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: ;;#ASMSTART ; GFX1100-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX1100-SDAG-NEXT: ;;#ASMEND ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s2 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND @@ -1134,8 +1122,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; 
GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: ;;#ASMSTART ; GFX1010-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1010-GISEL-NEXT: ;;#ASMEND @@ -1151,18 +1139,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX1100-GISEL-LABEL: test_writelane_m0_sreg_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: ;;#ASMSTART ; GFX1100-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX1100-GISEL-NEXT: ;;#ASMEND ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s2 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, m0, s0 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1176,8 +1164,8 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ 
-1191,8 +1179,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-SDAG-LABEL: test_writelane_imm_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1205,23 +1193,23 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-SDAG-LABEL: test_writelane_imm_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, 32 -; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, 32 +; GFX1100-SDAG-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX802-GISEL-LABEL: test_writelane_imm_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; 
GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,8 +1223,8 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1010-GISEL-LABEL: test_writelane_imm_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -1249,15 +1237,15 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX1100-GISEL-LABEL: test_writelane_imm_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, 32 -; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, 32 +; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm @@ -1270,7 +1258,7 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1285,7 +1273,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-SDAG-LABEL: test_writelane_imm_i64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1299,7 +1287,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1100-SDAG-LABEL: test_writelane_imm_i64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1315,7 +1303,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1330,7 +1318,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; ; GFX1010-GISEL-LABEL: test_writelane_imm_i64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1344,7 +1332,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, 
i64 %sr ; ; GFX1100-GISEL-LABEL: test_writelane_imm_i64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1366,7 +1354,7 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1381,7 +1369,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-SDAG-LABEL: test_writelane_imm_f64: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1395,7 +1383,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-SDAG-LABEL: test_writelane_imm_f64: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1411,7 +1399,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1426,7 +1414,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1010-GISEL-LABEL: test_writelane_imm_f64: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -1440,7 +1428,7 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; ; GFX1100-GISEL-LABEL: test_writelane_imm_f64: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -1462,10 +1450,10 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 @@ -1476,11 +1464,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: 
s_load_dword s4, s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-SDAG-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm @@ -1488,8 +1476,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 @@ -1501,10 +1489,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 @@ -1515,11 +1503,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX1010-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x8 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s3 ; GFX1010-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm @@ -1527,8 +1515,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x8 +; GFX1100-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x8 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1545,12 +1533,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1562,30 +1550,30 @@ define amdgpu_kernel void 
@test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1593,13 +1581,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr 
; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1610,30 +1598,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1646,12 +1634,12 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_mov_b32 m0, s8 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1663,30 +1651,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x2 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 
-; GFX1010-SDAG-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-SDAG-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s5, s8 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s4, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s7, s8 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x2 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-SDAG-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, s5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s1, s2 -; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s0, s2 +; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s3, s0 +; GFX1100-SDAG-NEXT: v_writelane_b32 v0, s2, s0 ; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1694,13 +1682,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; ; GFX802-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 -; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x18 +; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s8 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 @@ -1711,30 +1699,30 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x2 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX1010-GISEL-NEXT: s_load_dword s8, s[6:7], 0x18 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX1010-GISEL-NEXT: s_load_dword s8, s[4:5], 0x18 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s4, s8 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s5, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s6, s8 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_sreg_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x2 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX1100-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x18 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x18 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s0, s2 -; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s1, s2 +; GFX1100-GISEL-NEXT: v_writelane_b32 v0, s2, s0 +; GFX1100-GISEL-NEXT: v_writelane_b32 v1, s3, s0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1747,7 +1735,7 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1759,7 +1747,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-SDAG: ; %bb.0: -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1757,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1769,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: -; 
GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1793,7 +1781,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1010-GISEL: ; %bb.0: -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1803,7 +1791,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1820,8 +1808,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, i64 %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1836,22 +1824,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: 
s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1865,12 +1853,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1881,22 +1869,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 
-; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_i64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1915,8 +1903,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, double %src0, i32 %src1) #1 { ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX802-SDAG: ; %bb.0: -; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX802-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX802-SDAG-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1931,22 +1919,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-SDAG: ; %bb.0: ; GFX1010-SDAG-NEXT: s_clause 0x1 -; 
GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-SDAG-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s4 -; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s4 +; GFX1010-SDAG-NEXT: v_writelane_b32 v1, s3, s6 +; GFX1010-SDAG-NEXT: v_writelane_b32 v0, s2, s6 ; GFX1010-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-SDAG-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -1960,12 +1948,12 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX802-GISEL: ; %bb.0: -; GFX802-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 -; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX802-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 @@ -1976,22 +1964,22 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; 
GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1010-GISEL: ; %bb.0: ; GFX1010-GISEL-NEXT: s_clause 0x1 -; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX1010-GISEL-NEXT: s_load_dword s4, s[6:7], 0x10 +; GFX1010-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX1010-GISEL-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1010-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1010-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s4 -; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s4 +; GFX1010-GISEL-NEXT: v_writelane_b32 v0, s2, s6 +; GFX1010-GISEL-NEXT: v_writelane_b32 v1, s3, s6 ; GFX1010-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-GISEL-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: test_writelane_imm_oldval_f64: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX1100-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 5cf457d1753b30..eeddb3d5b81923 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) define amdgpu_kernel void @ceil_f16( ; SI-LABEL: ceil_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @ceil_f16( ; ; VI-LABEL: ceil_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-LABEL: ceil_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @ceil_f16( ; ; GFX11-FAKE16-LABEL: ceil_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -104,7 +104,7 @@ entry: define amdgpu_kernel void @ceil_v2f16( ; SI-LABEL: ceil_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; VI-LABEL: ceil_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -150,7 +150,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-LABEL: ceil_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @ceil_v2f16( ; ; GFX11-FAKE16-LABEL: ceil_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: 
s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 5514efa6838e73..fcc4cb3436fd7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: cos_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: cos_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: cos_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: cos_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 142145098df87f..3a867879bb809b 100644 
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -12,34 +12,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -47,34 +46,33 @@ define 
amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -82,16 +80,16 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; 
GFX900-SDAG-LABEL: s_exp_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -99,36 +97,36 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -136,10 +134,10 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -164,29 +162,29 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; 
SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; @@ -338,7 +336,7 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -390,7 +388,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -442,7 +440,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 @@ -481,7 +479,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -520,7 +518,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -562,7 +560,7 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -853,25 +851,25 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp_v3f32: 
; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -917,7 +915,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -926,19 +923,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -990,7 +987,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -999,11 +995,11 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1052,10 +1048,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; 
GFX900-GISEL-LABEL: s_exp_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1105,10 +1101,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1160,10 +1156,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_exp_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1594,26 +1590,26 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1677,7 +1673,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1686,28 +1681,29 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_exp_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; 
VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1723,7 +1719,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1748,7 +1744,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: 
v_mov_b32_e32 v7, 0x7f800000 @@ -1769,7 +1764,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1778,11 +1772,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_exp_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1793,8 +1787,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1839,16 +1833,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: 
s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1910,11 +1905,11 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-SDAG-LABEL: s_exp_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1926,7 +1921,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1972,16 +1967,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: 
s_endpgm ; ; SI-GISEL-LABEL: s_exp_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 4d981d27c309ea..a162949587481e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -14,34 +14,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-LABEL: s_exp10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, s0, v3 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-SDAG-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -49,34 +48,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2 +; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -84,16 +82,16 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 @@ -101,36 +99,36 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-SDAG-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] @@ -138,10 +136,10 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-SDAG-LABEL: s_exp10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 
0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0 @@ -166,29 +164,29 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2 ; SI-GISEL-NEXT: v_rndne_f32_e32 v3, v2 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0 +; SI-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3 ; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 +; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: 
s_endpgm ; @@ -340,7 +338,7 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 @@ -392,7 +390,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -444,7 +442,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 @@ -483,7 +481,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -522,7 +520,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: 
v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -564,7 +562,7 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 @@ -855,25 +853,25 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7 @@ -919,7 +917,6 @@ define 
amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 @@ -928,19 +925,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v2 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5 @@ -992,7 +989,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0 @@ -1001,11 +997,11 @@ define amdgpu_kernel void 
@s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 @@ -1054,10 +1050,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 ; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5 @@ -1107,10 +1103,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v0 @@ -1162,10 +1158,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp10_v3f32: ; SI-GISEL: ; %bb.0: -; 
SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1 @@ -1596,26 +1592,26 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s2, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2 ; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7 @@ -1679,7 +1675,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) 
%out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 -; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0 @@ -1688,28 +1683,29 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc ; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s0, s4, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v0, s4, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 ; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v3 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v3 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v1 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v4 ; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1 @@ -1725,7 +1721,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x 
float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000 ; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6 ; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2 ; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6 @@ -1750,7 +1746,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9 ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8 ; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000 @@ -1771,7 +1766,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5 -; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1780,11 +1774,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 +; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1795,8 +1789,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; 
GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3 @@ -1841,16 +1835,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 @@ -1912,11 +1907,11 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-SDAG-LABEL: s_exp10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; SI-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v3, v2 @@ -1928,7 +1923,7 @@ define amdgpu_kernel void 
@s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; SI-SDAG-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; SI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4 -; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000 +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_ldexp_f32_e32 v2, v2, v3 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0 ; SI-SDAG-NEXT: v_rndne_f32_e32 v6, v3 @@ -1974,16 +1969,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37 ; SI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2 ; SI-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 9f80e66e8f8731..36e78975cdb015 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -12,17 +12,17 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_exp2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 @@ -31,35 +31,35 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_exp2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: 
v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -69,14 +69,14 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_exp2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc @@ -88,8 +88,8 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_exp2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -101,25 +101,24 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, 
s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; @@ -173,7 +172,7 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -199,7 +198,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ 
-223,7 +222,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -247,7 +246,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -271,7 +270,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 @@ -294,7 +293,7 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x1f800000 @@ -381,8 +380,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -413,8 +412,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 @@ -445,8 +444,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -476,11 +475,11 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -507,8 +506,8 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: 
s_exp2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -532,16 +531,16 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x1f800000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc @@ -656,45 +655,45 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_exp2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 -; SI-SDAG-NEXT: s_mov_b32 
s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; SI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_add_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_add_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_add_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_add_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_exp_f32_e32 v6, v6 ; SI-SDAG-NEXT: v_exp_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_exp_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; 
SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -730,8 +729,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -767,8 +766,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_exp2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -804,8 +803,8 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 @@ -835,13 +834,13 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; 
GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 @@ -871,7 +870,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; R600-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index ece55c7f7dceaa..e8d037c5ff53e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) define amdgpu_kernel void @floor_f16( ; SI-LABEL: floor_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @floor_f16( ; ; VI-LABEL: floor_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-LABEL: floor_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -68,7 +68,7 @@ define amdgpu_kernel void @floor_f16( ; ; GFX11-FAKE16-LABEL: floor_f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ entry: define amdgpu_kernel void @floor_v2f16( ; SI-LABEL: floor_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; VI-LABEL: floor_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-LABEL: floor_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -180,7 +180,7 @@ define amdgpu_kernel void @floor_v2f16( ; ; GFX11-FAKE16-LABEL: floor_v2f16: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index edcdd323cb0aee..a2e30603b6afcd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> define amdgpu_kernel void @fmuladd_f16( ; SI-LABEL: fmuladd_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -48,7 +48,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-FLUSH-LABEL: fmuladd_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -76,7 +76,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; VI-DENORM-LABEL: fmuladd_f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -104,7 +104,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-FLUSH-LABEL: fmuladd_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -134,7 +134,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX10-DENORM-LABEL: fmuladd_f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 @@ -162,7 +162,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-FLUSH-LABEL: fmuladd_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: 
s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -195,7 +195,7 @@ define amdgpu_kernel void @fmuladd_f16( ; ; GFX11-DENORM-LABEL: fmuladd_f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -237,131 +237,131 @@ define amdgpu_kernel void @fmuladd_f16( define amdgpu_kernel void @fmuladd_f16_imm_a( ; SI-LABEL: fmuladd_f16_imm_a: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_a: ; VI-FLUSH: ; %bb.0: -; 
VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_a: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, 
s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, 
off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -388,8 +388,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -423,131 +423,131 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI-LABEL: fmuladd_f16_imm_b: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc 
+; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-FLUSH-LABEL: fmuladd_f16_imm_b: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 -; VI-FLUSH-NEXT: s_mov_b32 s10, -1 -; VI-FLUSH-NEXT: s_mov_b32 s14, s10 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s14, s2 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_mov_b32 s12, s6 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7 -; VI-FLUSH-NEXT: s_mov_b32 s15, s11 -; VI-FLUSH-NEXT: s_mov_b32 s2, s10 -; VI-FLUSH-NEXT: s_mov_b32 s3, s11 +; VI-FLUSH-NEXT: s_mov_b32 s15, s3 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s8, s4 -; VI-FLUSH-NEXT: s_mov_b32 s9, s5 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1 -; VI-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-FLUSH-NEXT: s_endpgm ; ; VI-DENORM-LABEL: fmuladd_f16_imm_b: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 -; VI-DENORM-NEXT: s_mov_b32 s10, -1 -; VI-DENORM-NEXT: s_mov_b32 s14, s10 +; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000 +; VI-DENORM-NEXT: s_mov_b32 s2, -1 +; VI-DENORM-NEXT: s_mov_b32 s14, s2 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_mov_b32 s12, s6 ; VI-DENORM-NEXT: s_mov_b32 s13, s7 -; VI-DENORM-NEXT: s_mov_b32 s15, s11 -; VI-DENORM-NEXT: s_mov_b32 s2, s10 -; VI-DENORM-NEXT: s_mov_b32 s3, s11 +; VI-DENORM-NEXT: s_mov_b32 s15, s3 +; VI-DENORM-NEXT: s_mov_b32 s10, s2 +; VI-DENORM-NEXT: s_mov_b32 s11, s3 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200 -; VI-DENORM-NEXT: s_mov_b32 s8, s4 -; VI-DENORM-NEXT: s_mov_b32 s9, s5 -; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1 -; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-DENORM-NEXT: s_mov_b32 s0, s4 +; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200 +; VI-DENORM-NEXT: s_mov_b32 s1, s5 +; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1 +; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-DENORM-NEXT: s_endpgm ; ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_clause 0x1 -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX10-FLUSH-NEXT: s_mov_b32 s2, s10 -; GFX10-FLUSH-NEXT: s_mov_b32 s3, s11 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x34 +; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3 +; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7 -; GFX10-FLUSH-NEXT: s_mov_b32 s8, s4 +; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: s_mov_b32 s9, s5 +; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX10-DENORM: ; %bb.0: ; GFX10-DENORM-NEXT: s_clause 0x1 -; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX10-DENORM-NEXT: s_mov_b32 s2, s10 -; GFX10-DENORM-NEXT: s_mov_b32 s3, s11 +; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-DENORM-NEXT: s_mov_b32 s2, -1 +; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-DENORM-NEXT: s_mov_b32 s14, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s15, s3 +; GFX10-DENORM-NEXT: s_mov_b32 s10, s2 +; GFX10-DENORM-NEXT: s_mov_b32 s11, s3 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6 ; GFX10-DENORM-NEXT: s_mov_b32 s13, 
s7 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: s_mov_b32 s8, s4 -; GFX10-DENORM-NEXT: s_mov_b32 s9, s5 +; GFX10-DENORM-NEXT: s_mov_b32 s0, s4 +; GFX10-DENORM-NEXT: s_mov_b32 s1, s5 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 -; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -574,8 +574,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: ; GFX11-DENORM: ; %bb.0: ; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 @@ -609,7 +609,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( define amdgpu_kernel void @fmuladd_v2f16( ; SI-LABEL: fmuladd_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -653,7 +653,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; 
VI-FLUSH-LABEL: fmuladd_v2f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -686,7 +686,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; VI-DENORM-LABEL: fmuladd_v2f16: ; VI-DENORM: ; %bb.0: -; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000 ; VI-DENORM-NEXT: s_mov_b32 s10, -1 ; VI-DENORM-NEXT: s_mov_b32 s14, s10 @@ -722,7 +722,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-FLUSH-LABEL: fmuladd_v2f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -752,7 +752,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX10-DENORM-LABEL: fmuladd_v2f16: ; GFX10-DENORM: ; %bb.0: -; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10 @@ -780,7 +780,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-FLUSH-LABEL: fmuladd_v2f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 @@ -813,7 +813,7 @@ define amdgpu_kernel void @fmuladd_v2f16( ; ; GFX11-DENORM-LABEL: fmuladd_v2f16: ; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 ; GFX11-DENORM-NEXT: 
s_mov_b32 s11, 0x31016000 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll index 2bb4cc617e7f17..aca7d3c720ceb5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll @@ -83,7 +83,7 @@ define i32 @strictfp_func_fpmode_i32() strictfp { define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; GFX6-LABEL: kernel_fpmode_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX6-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX7-LABEL: kernel_fpmode_i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19) ; GFX7-NEXT: s_and_b32 s4, 0x7f3ff, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX8-LABEL: kernel_fpmode_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19) ; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -119,7 +119,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX9-LABEL: kernel_fpmode_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -130,7 +130,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX10-LABEL: kernel_fpmode_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2 @@ -141,7 +141,7 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) { ; ; GFX11-LABEL: kernel_fpmode_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24) ; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 2e8049e9765e18..ea823f30f26c22 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -14,8 +14,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX7CHECK-LABEL: sgpr_isnan_bf16: ; GFX7CHECK: ; %bb.0: -; GFX7CHECK-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7CHECK-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7CHECK-NEXT: s_mov_b32 s3, 0xf000 ; GFX7CHECK-NEXT: s_mov_b32 s2, -1 ; GFX7CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -28,13 +28,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX8CHECK-LABEL: sgpr_isnan_bf16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX8CHECK-NEXT: s_movk_i32 s2, 0x7f80 +; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v0 +; GFX8CHECK-NEXT: 
v_and_b32_e32 v0, s2, v0 +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -43,26 +43,26 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_bf16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v1, 0x7fff -; GFX9CHECK-NEXT: s_movk_i32 s2, 0x7f80 +; GFX9CHECK-NEXT: s_movk_i32 s0, 0x7f80 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s2, v1 +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v1 ; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 +; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1] @@ -71,11 +71,11 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX11CHECK-LABEL: sgpr_isnan_bf16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: 
s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4 +; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2 ; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 9c248bd6e8b2aa..da64c379672ef7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f16: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -27,11 +27,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f16: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: s_and_b32 s3, s4, 0x7fff +; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff ; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00 ; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0 @@ -43,10 +43,10 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX8CHECK-LABEL: 
sgpr_isnan_f16: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -55,23 +55,23 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f16: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -79,11 +79,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr 
addrspace(1) %out, half %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f16: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11CHECK-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index a807885e0d8539..347e549e7cf566 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f32: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX7SELDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s3, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s2, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -25,22 +25,22 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f32: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dword s3, s[0:1], 0xb +; GFX7GLISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_mov_b32 s2, -1 -; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s4, 3 +; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] 
+; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7GLISEL-NEXT: s_endpgm ; ; GFX8CHECK-LABEL: sgpr_isnan_f32: ; GFX8CHECK: ; %bb.0: -; GFX8CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 +; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 ; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -49,23 +49,23 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f32: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 3 -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] -; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9CHECK-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9CHECK-NEXT: s_endpgm ; ; GFX10CHECK-LABEL: sgpr_isnan_f32: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_clause 0x1 -; GFX10CHECK-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; 
GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm @@ -73,11 +73,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX11CHECK-LABEL: sgpr_isnan_f32: ; GFX11CHECK: ; %bb.0: ; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s4, 3 +; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3 ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] @@ -93,7 +93,7 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f64: ; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xf000 ; GFX7SELDAG-NEXT: s_mov_b32 s6, -1 ; GFX7SELDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -106,7 +106,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX7GLISEL-LABEL: sgpr_isnan_f64: ; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] @@ -117,7 +117,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8SELDAG-LABEL: sgpr_isnan_f64: ; GFX8SELDAG: ; %bb.0: -; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -128,7 +128,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX8GLISEL-LABEL: sgpr_isnan_f64: ; GFX8GLISEL: ; %bb.0: -; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -139,7 +139,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX9CHECK-LABEL: sgpr_isnan_f64: ; GFX9CHECK: ; %bb.0: -; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3 @@ -149,7 +149,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX10CHECK-LABEL: sgpr_isnan_f64: ; GFX10CHECK: ; %bb.0: -; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 @@ -159,7 +159,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) { ; ; GFX11CHECK-LABEL: sgpr_isnan_f64: ; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 ; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) ; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index c2f6fbfe4667c0..d847af780acab3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; 
SI-SDAG-LABEL: s_log_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; 
VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void 
@s_log_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3377d1cf ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x41b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], 
s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; 
GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 
v2, 0x3377d1cf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3377d1cf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -396,7 +398,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v2f32: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217 @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void 
@s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; 
VI-GISEL-LABEL: s_log_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3f317217 @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; 
GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log_v3f32(ptr 
addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; 
GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) { define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3377d1cf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; SI-GISEL-LABEL: s_log_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-SDAG-LABEL: s_log_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) 
@@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; VI-GISEL-LABEL: s_log_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-SDAG-LABEL: s_log_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3377d1cf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; ; GFX900-GISEL-LABEL: s_log_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3f317217 @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-SDAG-LABEL: s_log_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; 
GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; GFX1100-GISEL-LABEL: s_log_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 
v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 0a1f7ab6fc0ae3..3f060de9f6596d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log10_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: 
s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 ; SI-SDAG-NEXT: v_fma_f32 v2, v0, s0, -v1 @@ -42,15 +42,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log10_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -70,15 +70,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-SDAG-LABEL: s_log10_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-SDAG-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 +; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s0, 0x7f800000 ; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -94,6 +94,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -101,15 +102,15 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log10_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 @@ -125,6 +126,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-GISEL-NEXT: 
v_sub_f32_e32 v2, v0, v1 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -132,17 +134,17 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log10_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf +; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX900-SDAG-NEXT: s_mov_b32 s1, 0x3284fbcf ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 ; GFX900-SDAG-NEXT: v_fma_f32 v3, v0, s0, -v2 @@ -154,20 +156,20 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x411a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-SDAG-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log10_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX900-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3284fbcf ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -181,18 +183,19 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log10_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff @@ -204,9 +207,8 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; 
GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -214,13 +216,14 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log10_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff @@ -230,11 +233,10 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4 
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -316,7 +318,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log10_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x3284fbcf @@ -357,7 +359,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -396,7 +398,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000 @@ -443,7 +445,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000 @@ -490,7 +492,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3e9a209a @@ -528,7 +530,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -566,7 +568,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 @@ -601,7 +603,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -747,8 +749,8 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x 
float> %in) { ; SI-SDAG-LABEL: s_log10_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -800,8 +802,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -853,7 +855,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_mov_b32 s8, 0x7f800000 @@ -862,7 +864,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc ; VI-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v2 -; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_and_b32_e32 v3, 0xfffff000, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v4, v2, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v3 @@ -919,8 +921,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log10_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; 
VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -984,8 +986,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1035,8 +1037,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3e9a209a @@ -1087,19 +1089,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v3f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 +; 
GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 @@ -1120,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1143,19 +1145,19 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v3f32: ; GFX1100-GISEL: ; %bb.0: ; 
GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 @@ -1176,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6 -; 
GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -1353,8 +1355,8 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log10_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: s_mov_b32 s12, 0x3284fbcf @@ -1417,8 +1419,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log10_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1481,8 +1483,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log10_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1563,8 +1565,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x 
float> %in) ; ; VI-GISEL-LABEL: s_log10_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1645,8 +1647,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log10_v4f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-SDAG-NEXT: s_mov_b32 s10, 0x3284fbcf @@ -1708,8 +1710,8 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-GISEL-LABEL: s_log10_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x3e9a209a @@ -1772,32 +1774,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-SDAG-LABEL: s_log10_v4f32: ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_clause 0x1 -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 
0x800000, s7 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 
0, 0x411a209b, s10 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 @@ -1833,32 +1835,32 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log10_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 
0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca04cc2356053..035b2439eff153 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -14,17 +14,17 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; SI-SDAG-LABEL: s_log2_f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; SI-SDAG-NEXT: s_mov_b32 s2, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 @@ -33,35 +33,35 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; SI-GISEL-LABEL: s_log2_f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -71,14 +71,14 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; VI-GISEL-LABEL: s_log2_f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -90,8 +90,8 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX900-SDAG-LABEL: s_log2_f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -103,44 +103,43 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 
s4, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1] +; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s4 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s3, 0x800000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -148,19 +147,19 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { ; ; GFX1100-GISEL-LABEL: s_log2_f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX1100-GISEL-NEXT: s_clause 0x1 +; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s0 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4 -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 -; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; 
GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -216,7 +215,7 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; SI-SDAG-LABEL: s_log2_v2f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -242,7 +241,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v2f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -266,7 +265,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -290,7 +289,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -314,7 +313,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 @@ -337,7 +336,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-GISEL-LABEL: s_log2_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42000000 @@ -360,7 +359,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 @@ -385,7 +384,7 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 @@ -473,8 +472,8 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; SI-SDAG-LABEL: s_log2_v3f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 
0x4f800000 @@ -505,8 +504,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v3f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 @@ -537,8 +536,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v3f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -568,11 +567,11 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v3f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -599,8 +598,8 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v3f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; 
GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -624,16 +623,16 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0 -; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx3 v7, v[0:2], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v3f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42000000 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc @@ -659,32 +658,32 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v3f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s5 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 
1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s1 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 +; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 
v6, v[0:2], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -693,21 +692,21 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v3f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -814,45 +813,45 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; SI-SDAG-LABEL: s_log2_v4f32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v6, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 1.0, v3, vcc -; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 +; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v3, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 -; SI-SDAG-NEXT: v_mul_f32_e32 v6, s6, v6 -; SI-SDAG-NEXT: v_mul_f32_e32 v8, s5, v8 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 +; SI-SDAG-NEXT: v_mul_f32_e32 v6, s2, v6 +; SI-SDAG-NEXT: v_mul_f32_e32 v8, s1, v8 +; SI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v1 ; SI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; SI-SDAG-NEXT: v_log_f32_e32 v6, v6 ; 
SI-SDAG-NEXT: v_log_f32_e32 v8, v8 ; SI-SDAG-NEXT: v_log_f32_e32 v9, v1 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 ; SI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2 ; SI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5 ; SI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7 ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0 -; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -888,8 +887,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v4f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -925,8 +924,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; VI-GISEL-LABEL: s_log2_v4f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -962,8 +961,8 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v4f32: ; 
GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000 @@ -993,13 +992,13 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v7, v6 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v9, v8 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v10, v0 -; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v4f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX900-GISEL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 @@ -1029,41 +1028,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX900-GISEL-NEXT: s_endpgm ; ; GFX1100-SDAG-LABEL: s_log2_v4f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 +; GFX1100-SDAG-NEXT: s_clause 0x1 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 -; GFX1100-SDAG-NEXT: 
v_cmp_gt_f32_e64 s1, 0x800000, s6 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5 ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s1 -; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3 ; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 
0x42000000, s9 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0 ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 -; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1072,32 +1072,32 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; GFX1100-GISEL-LABEL: s_log2_v4f32: ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_clause 0x1 -; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 
0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9 ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 5d3a5800bcdd8f..fa7ee9e8d28ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -425,8 +425,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX7-LABEL: s_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s4 ; 
GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -442,10 +442,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_maximum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_max_f16_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -456,10 +456,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_maximum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_max_f16_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -485,8 +485,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f16_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -870,10 +870,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 -; GFX7-NEXT: 
v_cvt_f16_f32_e32 v1, s7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s16 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, s6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, s4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -897,16 +897,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 +; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_max_f16_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_max_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_max_f16_e32 v3, s4, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -918,17 +918,17 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: v_pk_max_f16 v1, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; 
GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,13 +963,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_max_f16 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index b9b5994dae5eef..4d2ed9be77a1bf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -401,10 +401,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_maximum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 
0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_maximum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_maximum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_max_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_max_f32_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_max_f32_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_maximum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_max_f32_e32 v3, s4, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_maximum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_max_f32_e32 v3, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_maximum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 
vcc, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_max_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_max_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_max_f32_e64 v0, s5, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 +; GFX10-NEXT: v_max_f32_e64 v2, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index 05ffdaecad3ae1..52825205c526e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -427,10 +427,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_maximum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ 
-442,10 +442,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_maximum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_maximum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_max_f64 v[2:3], s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_max_f64 v[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_maximum_v2f64: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_maximum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; 
GFX9-LABEL: s_maximum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_maximum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] -; GFX10-NEXT: v_max_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, 
s[6:7], s[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 +; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[6:7] +; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[4:5] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index c7913f638798ac..d056a97dc54442 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -13,111 +13,111 @@ declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @maxnum_f16( ; SI-LABEL: maxnum_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 
0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 
s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 
s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -155,7 +155,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-LABEL: maxnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; VI-LABEL: maxnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX9-LABEL: maxnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -213,7 +213,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX10-LABEL: maxnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -232,7 +232,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; ; GFX11-LABEL: maxnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -263,7 +263,7 @@ entry: define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-LABEL: maxnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -283,7 +283,7 
@@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; VI-LABEL: maxnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -302,7 +302,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX9-LABEL: maxnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -321,7 +321,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX10-LABEL: maxnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -340,7 +340,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; ; GFX11-LABEL: maxnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -371,8 +371,8 @@ entry: define amdgpu_kernel void @maxnum_v2f16( ; SI-LABEL: maxnum_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -396,8 +396,8 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; VI-LABEL: maxnum_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: 
s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -420,18 +420,18 @@ define amdgpu_kernel void @maxnum_v2f16( ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -439,16 +439,16 @@ define amdgpu_kernel void @maxnum_v2f16( ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -456,8 +456,8 @@ define 
amdgpu_kernel void @maxnum_v2f16( ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -486,7 +486,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-LABEL: maxnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -506,7 +506,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; VI-LABEL: maxnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -524,7 +524,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -538,7 +538,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX10-LABEL: maxnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -551,7 +551,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; ; GFX11-LABEL: maxnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -576,7 +576,7 @@ entry: define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-LABEL: maxnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -596,7 +596,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; VI-LABEL: maxnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -628,7 +628,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX10-LABEL: maxnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -641,7 +641,7 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; ; GFX11-LABEL: maxnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -667,8 +667,8 @@ entry: define amdgpu_kernel void @maxnum_v3f16( ; SI-LABEL: maxnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -697,8 +697,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; VI-LABEL: maxnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -725,21 +725,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -748,17 +748,17 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 @@ -769,8 +769,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -804,28 +804,28 @@ entry: define amdgpu_kernel void @maxnum_v4f16( ; SI-LABEL: maxnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -838,13 +838,13 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: maxnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -876,21 +876,21 @@ define amdgpu_kernel void @maxnum_v4f16( ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -898,17 +898,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 @@ -918,8 +918,8 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 
0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -951,7 +951,7 @@ entry: define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-LABEL: fmax_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -980,7 +980,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; VI-LABEL: fmax_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX9-LABEL: fmax_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX10-LABEL: fmax_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1041,7 +1041,7 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; ; GFX11-LABEL: fmax_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 01effc24e741d1..e00ebff751c73e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -351,10 +351,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-LABEL: s_minimum_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_min_f16_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: ;;#ASMSTART @@ -365,10 +365,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX9-LABEL: s_minimum_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f16_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_min_f16_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: ;;#ASMSTART @@ -394,8 +394,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f16_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f16_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: ;;#ASMSTART @@ -709,16 +709,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f16: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s7, 16 -; GFX8-NEXT: s_lshr_b32 s5, s6, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 +; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_min_f16_e32 v1, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_min_f16_e32 v3, s6, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_min_f16_e32 v3, s4, v1 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -730,17 +730,17 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_lshr_b32 s4, s7, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s6, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: v_pk_min_f16 v1, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s6, v0 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 
v0, v1, 16, v0 @@ -775,13 +775,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s6, s7 -; GFX10-NEXT: s_lshr_b32 s4, s7, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: v_pk_min_f16 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s5 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 +; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 1fefa743409152..9a58235e2a9f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -401,10 +401,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX7-LABEL: s_minimum_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_min_f32_e32 v1, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v0 @@ -414,10 +414,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-LABEL: s_minimum_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: 
v_min_f32_e32 v1, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -427,10 +427,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX9-LABEL: s_minimum_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_min_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_min_f32_e32 v1, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 @@ -454,8 +454,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s6, s7 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s7 +; GFX10-NEXT: v_min_f32_e64 v0, s4, s5 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 @@ -781,14 +781,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7-LABEL: s_minimum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s17 -; GFX7-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX7-NEXT: v_min_f32_e32 v3, s4, v0 +; 
GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] @@ -798,14 +798,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-LABEL: s_minimum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s17 -; GFX8-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX8-NEXT: v_min_f32_e32 v3, s4, v0 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] @@ -815,14 +815,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9-LABEL: s_minimum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_min_f32_e32 v3, s6, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s6, v0 +; GFX9-NEXT: v_min_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] @@ -850,11 +850,11 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f32_e64 v0, s7, s17 -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s7, s17 -; GFX10-NEXT: v_min_f32_e64 v2, s6, s16 +; GFX10-NEXT: v_min_f32_e64 v0, s5, s7 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 +; GFX10-NEXT: v_min_f32_e64 v2, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo -; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s6, s16 +; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 2584e3394583a9..878b65d7f32e1a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -427,10 +427,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX7-LABEL: s_minimum_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -442,10 +442,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-LABEL: s_minimum_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 
vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -457,10 +457,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX9-LABEL: s_minimum_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_min_f64 v[2:3], s[4:5], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc @@ -487,8 +487,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[16:17] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[6:7], s[16:17] +; GFX10-NEXT: v_min_f64 v[0:1], s[4:5], s[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: ;;#ASMSTART @@ -844,14 +844,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7-LABEL: s_minimum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX7-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; 
GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -865,14 +865,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-LABEL: s_minimum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NEXT: v_mov_b32_e32 v5, s19 -; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX8-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -886,14 +886,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9-LABEL: s_minimum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s20 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[6:7], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 
v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -907,11 +907,11 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940-LABEL: s_minimum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] ; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc @@ -927,14 +927,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21] -; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21] -; GFX10-NEXT: v_min_f64 v[4:5], s[6:7], s[18:19] -; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[6:7], s[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5 +; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 
v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND @@ -943,10 +943,10 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX11-LABEL: s_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[16:17] -; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[16:17] -; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[6:7] -; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 @@ -964,8 +964,8 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[16:17] -; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[6:7] +; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[6:7] +; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[4:5] ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use v[0:3] ; GFX12-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 0a004fd7701cfc..f934a2de9247f0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -13,111 +13,111 @@ declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) define amdgpu_kernel void @minnum_f16_ieee( ; SI-LABEL: minnum_f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd 
-; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 
glc +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc +; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, 
-1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc dlc +; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -182,7 +182,7 @@ define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { define amdgpu_kernel void @minnum_f16_imm_a( ; SI-LABEL: minnum_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -202,7 +202,7 @@ define amdgpu_kernel 
void @minnum_f16_imm_a( ; ; VI-LABEL: minnum_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX9-LABEL: minnum_f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -240,7 +240,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX10-LABEL: minnum_f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -259,7 +259,7 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; ; GFX11-LABEL: minnum_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -289,7 +289,7 @@ entry: define amdgpu_kernel void @minnum_f16_imm_b( ; SI-LABEL: minnum_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -309,7 +309,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; VI-LABEL: minnum_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX9-LABEL: minnum_f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -347,7 +347,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX10-LABEL: minnum_f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -366,7 +366,7 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; ; GFX11-LABEL: minnum_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -396,8 +396,8 @@ entry: define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-LABEL: minnum_v2f16_ieee: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -421,8 +421,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; VI-LABEL: minnum_v2f16_ieee: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -445,18 +445,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -464,16 +464,16 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -481,8 +481,8 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 
0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -545,7 +545,7 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-LABEL: minnum_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -565,7 +565,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; VI-LABEL: minnum_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -583,7 +583,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -597,7 +597,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX10-LABEL: minnum_v2f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -610,7 +610,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; ; GFX11-LABEL: minnum_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -634,7 +634,7 @@ entry: define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-LABEL: minnum_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -654,7 +654,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; VI-LABEL: minnum_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 @@ -672,7 +672,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -686,7 +686,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX10-LABEL: minnum_v2f16_imm_b: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 @@ -699,7 +699,7 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; ; GFX11-LABEL: minnum_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 @@ -724,8 +724,8 @@ entry: define amdgpu_kernel void @minnum_v3f16( ; SI-LABEL: minnum_v3f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -754,8 +754,8 @@ define amdgpu_kernel void @minnum_v3f16( ; ; 
VI-LABEL: minnum_v3f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -782,21 +782,21 @@ define amdgpu_kernel void @minnum_v3f16( ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s8, s8 -; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v2, s9, s9 +; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 +; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -805,17 +805,17 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 @@ -826,8 +826,8 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -860,28 +860,28 @@ entry: define amdgpu_kernel void @minnum_v4f16( ; SI-LABEL: minnum_v4f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: s_lshr_b32 s2, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 
s2, s1, 16 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -894,13 +894,13 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: minnum_v4f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -932,21 +932,21 @@ define amdgpu_kernel void @minnum_v4f16( ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 ; 
GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s9, s9 -; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v2, s8, s8 +; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 +; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -954,17 +954,17 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX10-NEXT: v_pk_max_f16 v0, s1, s1 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 -; GFX10-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 @@ -974,8 +974,8 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1006,7 +1006,7 @@ entry: 
define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-LABEL: fmin_v4f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; VI-LABEL: fmin_v4f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX9-LABEL: fmin_v4f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX10-LABEL: fmin_v4f16_imm_a: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; ; GFX11-LABEL: fmin_v4f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 53ea253035655c..c3e665fa8269a0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -332,7 +332,7 @@ bb: define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_s: ; SI: ; %bb.0: ; %bb -; 
SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -365,7 +365,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: umulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -394,7 +394,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: umulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -423,7 +423,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: umulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -454,7 +454,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: umulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 @@ -491,7 +491,7 @@ bb: define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; SI-LABEL: smulo_i64_s: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 @@ -540,7 +540,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX9-LABEL: smulo_i64_s: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -581,7 +581,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX10-LABEL: smulo_i64_s: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -622,7 +622,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX11-LABEL: smulo_i64_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 @@ -667,7 +667,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; ; GFX12-LABEL: smulo_i64_s: ; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 3d73f84b6e9a80..826862e1249203 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -17,12 +17,12 @@ define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -45,8 +45,8 @@ entry: define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -56,12 +56,12 @@ define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -84,8 +84,8 @@ entry: define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -95,12 +95,12 @@ define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) { ; ; 
VI-LABEL: local_size_z: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -123,8 +123,8 @@ entry: define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xy: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s4, s4, s5 @@ -135,13 +135,13 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xy: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -166,12 +166,12 @@ entry: define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dword s5, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x6 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; 
SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -179,11 +179,11 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x18 -; VI-NEXT: s_load_dword s5, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x18 +; VI-NEXT: s_load_dword s3, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s2, s4, s5 +; VI-NEXT: s_mul_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -211,7 +211,7 @@ entry: define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_yz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x7 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mul_i32 s0, s0, s1 @@ -224,7 +224,7 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x1c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -254,13 +254,13 @@ entry: define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; SI-LABEL: local_size_xyz: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x6 -; SI-NEXT: s_load_dword s6, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6 +; SI-NEXT: s_load_dword s2, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s2, s4, s5 -; SI-NEXT: s_add_i32 s4, s2, s6 +; SI-NEXT: s_mul_i32 s4, s4, s5 +; SI-NEXT: s_add_i32 s4, s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -268,15 +268,15 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_xyz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18 -; VI-NEXT: s_load_dword s4, s[2:3], 0x20 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18 +; VI-NEXT: s_load_dword s4, s[0:1], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_mul_i32 s2, s2, s3 +; VI-NEXT: s_add_i32 s2, s2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -304,8 +304,8 @@ entry: define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_x_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x6 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,12 +315,12 @@ define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_x_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x18 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x18 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: 
v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -345,8 +345,8 @@ entry: define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_y_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x7 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x7 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -356,12 +356,12 @@ define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_y_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x1c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -386,8 +386,8 @@ entry: define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; SI-LABEL: local_size_z_known_bits: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -397,12 +397,12 @@ define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_z_known_bits: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x20 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: 
s_load_dword s0, s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 47dd0263d020ea..84afa3b0096ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -10,7 +10,7 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) define amdgpu_kernel void @rint_f16( ; SI-LABEL: rint_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX89-LABEL: rint_f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_mov_b32 s10, s6 @@ -48,7 +48,7 @@ define amdgpu_kernel void @rint_f16( ; ; GFX11-LABEL: rint_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -85,7 +85,7 @@ entry: define amdgpu_kernel void @rint_v2f16( ; SI-LABEL: rint_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -111,7 +111,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; VI-LABEL: rint_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -131,7 +131,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX9-LABEL: rint_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @rint_v2f16( ; ; GFX11-LABEL: rint_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index fc962b1b4a377f..ddbc5ef4e5b600 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 @@ -41,7 +41,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; ; CI-LABEL: round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 @@ -68,7 +68,7 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-LABEL: v_round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -141,65 +141,64 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s12, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s12 -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s12, 0 -; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 -; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s12, s10, s4 -; SI-NEXT: s_cselect_b32 s13, s11, s5 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: s_and_b64 s[2:3], s[14:15], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: 
s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], s3 +; SI-NEXT: s_and_b32 s12, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[10:11], s[6:7], s[10:11] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s10, 0, s10 +; SI-NEXT: s_cselect_b32 s11, s12, s11 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s10, s6, s10 +; SI-NEXT: s_cselect_b32 s11, s7, s11 +; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s7, v0, v1 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_ge_f64_e64 s[12:13], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 s8, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s8, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_add_f64 v[2:3], s[4:5], -v[2:3] +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[2:3], s[12:13], 
v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s7, v1, v4 -; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfi_b32 v1, s12, v1, v4 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v2f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -233,151 +232,151 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s14 -; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s13, 0xfffff +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_brev_b32 s18, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 -; SI-NEXT: s_add_i32 s16, s12, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[12:13], s[0:1], s16 -; SI-NEXT: s_and_b32 s15, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[12:13], s[6:7], s[12:13] -; SI-NEXT: s_cmp_lt_i32 s16, 0 -; SI-NEXT: s_cselect_b32 s12, 0, s12 -; SI-NEXT: s_cselect_b32 s13, s15, s13 -; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s16, s6, s12 -; SI-NEXT: s_cselect_b32 s17, s7, s13 -; SI-NEXT: v_mov_b32_e32 v0, s16 
-; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], s3 +; SI-NEXT: s_and_b32 s16, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[14:15], s[6:7], s[14:15] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_cselect_b32 s15, s16, s15 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s14, s6, s14 +; SI-NEXT: s_cselect_b32 s15, s7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 s16, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s16, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[18:19], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[0:1], s6 -; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s7, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, s7, s3 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: s_brev_b32 s15, -2 -; SI-NEXT: s_cselect_b32 s2, s4, s2 -; SI-NEXT: v_bfi_b32 
v5, s15, v0, v1 -; SI-NEXT: s_cselect_b32 s3, s5, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[14:15], v[4:5] +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 +; SI-NEXT: s_and_b64 s[14:15], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s7, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_and_b32 s14, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s7, s5 -; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s5, s14, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s15, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[6:7], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s2, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s3, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s2, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s3, s1 -; 
SI-NEXT: s_cmp_gt_i32 s2, 51 -; SI-NEXT: s_cselect_b32 s1, s9, s1 -; SI-NEXT: s_cselect_b32 s0, s8, s0 -; SI-NEXT: v_mov_b32_e32 v6, s1 -; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[4:5] +; SI-NEXT: s_and_b64 s[6:7], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 +; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s10, s7 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s6, s8, s6 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v6, s7 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s15, v8, v9 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_bfi_b32 v5, s15, v5, v8 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[4:5] -; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 +; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; 
SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; CI-NEXT: s_brev_b32 s14, -2 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s14, v8, v5 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s5 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; 
CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[6:7]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s14, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v8, s9 -; CI-NEXT: v_bfi_b32 v5, s14, v5, v8 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v8 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: s_mov_b32 s14, -1 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, ptr addrspace(1) %out @@ -387,125 +386,124 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s22 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s21, 0xfffff +; SI-NEXT: s_mov_b32 s20, s2 ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s20, s7, 0xb0014 -; 
SI-NEXT: s_add_i32 s24, s20, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[20:21], s[0:1], s24 -; SI-NEXT: s_and_b32 s23, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[20:21], s[6:7], s[20:21] -; SI-NEXT: s_cmp_lt_i32 s24, 0 -; SI-NEXT: s_cselect_b32 s20, 0, s20 -; SI-NEXT: s_cselect_b32 s21, s23, s21 -; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: s_cselect_b32 s24, s6, s20 -; SI-NEXT: s_cselect_b32 s25, s7, s21 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s3 +; SI-NEXT: s_and_b32 s24, s7, 0x80000000 +; SI-NEXT: s_andn2_b64 s[22:23], s[6:7], s[22:23] +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s22, 0, s22 +; SI-NEXT: s_cselect_b32 s23, s24, s23 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s22, s6, s22 +; SI-NEXT: s_cselect_b32 s23, s7, s23 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b64 s[2:3], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_and_b64 s[24:25], s[24:25], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: s_add_i32 s24, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s24 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s23, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s25, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s24, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s23, s7 -; SI-NEXT: 
s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_cselect_b32 s7, s25, s7 +; SI-NEXT: s_cmp_gt_i32 s24, 51 ; SI-NEXT: s_cselect_b32 s6, s4, s6 -; SI-NEXT: v_bfi_b32 v9, s2, v0, v1 +; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[8:9] +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: s_and_b64 s[24:25], s[26:27], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_and_b64 s[22:23], s[24:25], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s22, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s22 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] ; SI-NEXT: s_and_b32 s23, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s22, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s23, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s22, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] -; SI-NEXT: v_bfi_b32 v9, s2, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 +; SI-NEXT: v_cmp_ge_f64_e64 s[22:23], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: s_and_b64 s[6:7], s[24:25], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v6, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; 
SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_and_b64 s[6:7], s[22:23], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_bfe_u32 s6, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s10, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s11, s7 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 ; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 +; SI-NEXT: v_mov_b32_e32 v10, s9 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s4, s14, s4 ; SI-NEXT: s_cselect_b32 s5, s15, s5 ; SI-NEXT: 
v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v12, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: s_bfe_u32 s6, s13, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s8, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s7, s13, s7 ; SI-NEXT: s_cselect_b32 s6, s12, s6 ; SI-NEXT: v_mov_b32_e32 v10, s7 @@ -513,20 +511,20 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v13, s15 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v12, v13 +; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] ; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[0:1], s3 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_bfe_u32 s4, s19, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 
s[4:5], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s8, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s5, s19, s5 ; SI-NEXT: s_cselect_b32 s4, s18, s4 ; SI-NEXT: v_mov_b32_e32 v10, s5 @@ -534,128 +532,129 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s13 ; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v14, v15 +; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] ; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[16:17], s[0:1] -; SI-NEXT: s_and_b32 s6, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s6, s1 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s1, s17, s1 -; SI-NEXT: s_cselect_b32 s0, s16, s0 -; SI-NEXT: v_mov_b32_e32 v15, s1 -; SI-NEXT: v_mov_b32_e32 v14, s0 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: s_bfe_u32 s6, s17, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 +; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[6:7] +; SI-NEXT: s_and_b32 s9, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s7, s17, s7 +; SI-NEXT: s_cselect_b32 s6, s16, s6 +; SI-NEXT: v_mov_b32_e32 v15, 
s7 +; SI-NEXT: v_mov_b32_e32 v14, s6 ; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s19 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v16 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[14:15]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 ; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 +; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: v_bfi_b32 v9, s2, v9, v14 -; SI-NEXT: v_add_f64 v[14:15], s[0:1], v[8:9] -; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 +; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[8:9] +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; CI-NEXT: s_brev_b32 s22, -2 +; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; CI-NEXT: s_mov_b32 s23, 0xf000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] ; 
CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s7 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_bfi_b32 v5, s22, v8, v5 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s5 ; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] ; CI-NEXT: v_mov_b32_e32 v12, s11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v12 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; 
CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v16 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v18, s13 ; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 ; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: 
s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v20, s19 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_bfi_b32 v5, s22, v5, v20 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v18, s17 -; CI-NEXT: v_bfi_b32 v5, s22, v5, v18 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 ; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] -; CI-NEXT: s_mov_b32 s22, -1 -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index d5b4f879bf8a02..7ad7cc821c1b56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,8 +8,8 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s6, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 
s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -24,39 +24,57 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s6, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] -; GFX89-NEXT: s_brev_b32 s4, -2 -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: 
v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -91,7 +109,7 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { ; GFX6-LABEL: round_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -117,7 +135,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX89-LABEL: round_v2f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_brev_b32 s8, -2 ; GFX89-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 @@ -143,7 +161,7 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 @@ -198,8 +216,8 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { ; GFX6-LABEL: round_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -235,50 +253,89 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_v4f32: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s10, -2 -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] -; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v2 -; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 -; 
GFX89-NEXT: v_trunc_f32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s10, v1, v4 -; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] -; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s10, v4, v5 -; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_brev_b32 s10, -2 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] 
+; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: 
s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -355,145 +412,145 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; GFX6-NEXT: s_brev_b32 s14, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_trunc_f32_e32 v0, s7 ; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s5 ; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v4, s5 -; GFX6-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX6-NEXT: v_bfi_b32 v1, 
s14, v1, v4 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX6-NEXT: v_trunc_f32_e32 v0, s4 ; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, s11 ; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s10 ; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v6, s10 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s9 ; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v4, s8 ; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX6-NEXT: 
v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX6-NEXT: v_mov_b32_e32 v9, s8 -; GFX6-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 -; GFX89-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GFX89-NEXT: s_brev_b32 s2, -2 -; GFX89-NEXT: s_mov_b32 s15, 0xf000 -; GFX89-NEXT: s_mov_b32 s14, -1 +; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX89-NEXT: s_brev_b32 s14, -2 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_trunc_f32_e32 v0, s7 ; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s7 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s6 ; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] ; GFX89-NEXT: v_mov_b32_e32 v2, s6 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s5 ; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 
v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v4, s5 -; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX89-NEXT: v_trunc_f32_e32 v0, s4 ; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] ; GFX89-NEXT: v_mov_b32_e32 v5, s4 -; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX89-NEXT: v_trunc_f32_e32 v4, s11 ; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s11 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s10 ; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v6, s10 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 ; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s9 ; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v8, s9 -; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 ; GFX89-NEXT: 
v_add_f32_e32 v5, v4, v5 ; GFX89-NEXT: v_trunc_f32_e32 v4, s8 ; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1] +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] ; GFX89-NEXT: v_mov_b32_e32 v9, s8 -; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9 +; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 ; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s7 @@ -628,10 +685,10 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-LABEL: round_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 @@ -642,44 +699,62 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX89-LABEL: round_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX89-NEXT: s_movk_i32 s5, 0x7fff -; GFX89-NEXT: s_mov_b32 s3, 0xf000 -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f16_e32 v1, s4 -; GFX89-NEXT: v_sub_f16_e32 v2, s4, v1 -; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_mov_b32_e32 v2, s4 -; GFX89-NEXT: v_bfi_b32 v0, s5, v0, v2 -; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX89-NEXT: s_endpgm +; GFX8-LABEL: round_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 +; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: round_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e32 v1, s2 +; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 +; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 +; GFX9-NEXT: s_mov_b32 s6, -1 +; 
GFX9-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s4 +; GFX11-NEXT: v_trunc_f16_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v1, s4, v0 -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v1|, 0.5 +; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -723,13 +798,13 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1 ; 
GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0 @@ -748,14 +823,13 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -782,57 +856,57 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_movk_i32 s1, 0x7fff +; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: v_trunc_f16_e32 v1, s5 -; GFX9-NEXT: v_sub_f16_e32 v2, s5, v1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s0 +; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 -; GFX9-NEXT: v_trunc_f16_e32 v2, s4 -; GFX9-NEXT: v_sub_f16_e32 v3, s4, v2 +; GFX9-NEXT: v_trunc_f16_e32 v2, s2 +; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s5, s4, 16 -; GFX11-NEXT: v_trunc_f16_e32 v1, s4 -; GFX11-NEXT: v_trunc_f16_e32 v0, s5 +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_trunc_f16_e32 v1, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v3, s4, v1 -; GFX11-NEXT: v_sub_f16_e32 v2, s5, v0 +; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s2 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 s2, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s5 +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 
0, 0x3c00, s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll index 70f15bd0aa6131..6a9c4c8d41c202 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll @@ -64,7 +64,7 @@ define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) { define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; GFX6-LABEL: s_set_rounding_kernel: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX6-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX6-NEXT: ;;#ASMSTART @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX7-LABEL: s_set_rounding_kernel: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s2, s[2:3], 0x9 +; GFX7-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX7-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX7-NEXT: ;;#ASMSTART @@ -94,7 +94,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX8-LABEL: s_set_rounding_kernel: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX8-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX8-NEXT: ;;#ASMSTART @@ -109,7 +109,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX9-LABEL: s_set_rounding_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dword s2, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s0, 0x1c84a50f ; GFX9-NEXT: s_mov_b32 s1, 0xb73e62d9 ; GFX9-NEXT: ;;#ASMSTART @@ -124,7 +124,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX10-LABEL: s_set_rounding_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -139,7 +139,7 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) { ; ; GFX11-LABEL: s_set_rounding_kernel: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index a70f4d8d900650..2ce0a628686ea0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -30,7 +30,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -46,7 +46,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -58,7 +58,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX10-LABEL: sin_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -70,7 +70,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -91,7 +91,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -121,7 +121,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX8-LABEL: sin_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -142,7 +142,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) 
{ ; ; GFX10-LABEL: sin_v2f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -174,7 +174,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX11-LABEL: sin_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index c69ebedbec50b5..f2d57ba902e735 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) define amdgpu_kernel void @sqrt_f16( ; SI-LABEL: sqrt_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; VI-LABEL: sqrt_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @sqrt_f16( ; ; GFX11-LABEL: sqrt_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ entry: define amdgpu_kernel void @sqrt_v2f16( ; SI-LABEL: sqrt_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; VI-LABEL: sqrt_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -129,7 +129,7 @@ define amdgpu_kernel void @sqrt_v2f16( ; ; GFX11-LABEL: sqrt_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 11f5e6ebf99980..d1e2ddcdc6eacf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) define amdgpu_kernel void @trunc_f16( ; SI-LABEL: trunc_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @trunc_f16( ; ; VI-LABEL: trunc_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -47,7 +47,7 @@ define amdgpu_kernel void @trunc_f16( ; ; GFX11-LABEL: trunc_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -84,7 +84,7 @@ entry: define amdgpu_kernel void @trunc_v2f16( ; SI-LABEL: trunc_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; VI-LABEL: trunc_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @trunc_v2f16( ; ; GFX11-LABEL: trunc_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll index 029c4e51e29934..7c5ab1790c548c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-LABEL: constant_load_v8f32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s16, s[10:11], 0x0 ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocaptur ; ; GFX12-LABEL: constant_load_v8f32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 
7202ab8b314669..cfaefca3a516d7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_f64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_f64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -47,7 +47,7 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_f64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -68,7 +68,7 @@ attributes #0 = { nounwind } define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-NOHSA-LABEL: constant_load_2v4f64: ; GFX6-NOHSA: ; %bb.0: ; %entry -; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[24:25], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -92,7 +92,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -114,7 +114,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX8-NOHSA-LABEL: constant_load_2v4f64: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX12-LABEL: constant_load_2v4f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 7178eaf2e73846..04fba9ef6d86df 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: constant_load_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v2i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -101,7 +101,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v2i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -140,7 +140,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, 
v0, s[2:3] @@ -157,7 +157,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v3i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -174,7 +174,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v3i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -212,7 +212,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -229,7 +229,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v4i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v4i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -285,7 
+285,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -302,7 +302,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v8i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -319,7 +319,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-LABEL: constant_load_v8i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -358,7 +358,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -375,7 +375,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v16i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -392,7 +392,7 @@ define 
amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v16i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -431,7 +431,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -448,7 +448,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v32i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -460,7 +460,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v32i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -488,7 +488,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v32i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -505,7 +505,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp define 
amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_v64i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -518,7 +518,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v64i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -547,7 +547,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v64i1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -565,7 +565,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -611,7 +611,7 @@ define amdgpu_kernel void 
@constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -629,7 +629,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -647,7 +647,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -698,7 +698,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ 
-715,7 +715,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -744,7 +744,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -762,7 +762,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -811,7 +811,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -831,7 +831,7 @@ define amdgpu_kernel void 
@constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -850,7 +850,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -884,7 +884,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -907,7 +907,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -926,7 +926,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: 
v_mov_b32_e32 v1, s3 @@ -961,7 +961,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v2, s[2:3] @@ -983,7 +983,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1073,7 +1073,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1094,7 +1094,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v3, s[2:3] @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1257,7 +1257,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1323,7 +1323,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1376,7 +1376,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1507,7 +1507,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: 
constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1649,7 +1649,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -1827,7 +1827,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -1863,7 +1863,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -1990,7 +1990,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: 
constant_sextload_v16i1_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v16, s[2:3] @@ -2043,7 +2043,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -2132,7 +2132,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,7 +2349,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2441,7 +2441,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 
s3, 0xf000 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2859,7 +2859,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 
; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3614,7 +3614,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -3780,7 +3780,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4241,7 +4241,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4400,7 +4400,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4419,7 +4419,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_zextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4451,7 +4451,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4471,7 +4471,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_i1_to_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4490,7 +4490,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: constant_sextload_i1_to_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4523,7 +4523,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i1_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4543,7 +4543,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4562,7 +4562,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4594,7 +4594,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4666,7 +4666,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4686,7 +4686,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4707,7 +4707,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4745,7 +4745,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -4768,7 +4768,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4790,7 +4790,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX8-LABEL: 
constant_sextload_v2i1_to_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -4829,7 +4829,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v4, s[2:3] @@ -4854,7 +4854,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4878,7 +4878,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4931,7 +4931,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v5, s[2:3] @@ -4960,7 +4960,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void 
@constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -4986,7 +4986,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5041,7 +5041,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v6, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v6, s[2:3] @@ -5072,7 +5072,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void 
@constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5192,7 +5192,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -5221,7 +5221,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5282,7 +5282,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v8, s[2:3] @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, 
-1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5353,7 +5353,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5494,7 +5494,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5537,7 +5537,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX8-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, 
v16, s[2:3] @@ -5687,7 +5687,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -5746,7 +5746,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v2 @@ -5930,7 +5930,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5997,7 +5997,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s10, s2 @@ -6069,7 +6069,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -6263,7 +6263,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v32, s[2:3] @@ -6336,7 +6336,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -6443,7 +6443,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6779,7 +6779,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6898,7 +6898,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: 
constant_sextload_v32i1_to_v32i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -7063,7 +7063,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -7444,7 +7444,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7574,7 +7574,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7777,7 +7777,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -8428,7 +8428,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr 
addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8645,7 +8645,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -8968,7 +8968,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -9715,7 +9715,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s19, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s5, s19 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 355c296d122ff2..a015a39a7184fc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -38,7 +38,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: constant_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -77,7 +77,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -95,7 +95,7 @@ entry: define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -107,7 +107,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -119,7 +119,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -147,7 +147,7 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -165,7 +165,7 @@ entry: define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -180,7 +180,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: 
constant_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 @@ -252,7 +252,7 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -273,7 +273,7 @@ entry: define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -286,7 +286,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -328,7 +328,7 @@ define 
amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -347,7 +347,7 @@ entry: define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -362,7 +362,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GCN-NOHSA-VI-LABEL: constant_load_v8i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -408,7 +408,7 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 
v4, 0 @@ -428,7 +428,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -449,7 +449,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -473,7 +473,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s8, 16 @@ -522,7 +522,7 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i16: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -546,7 +546,7 @@ entry: define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #0 { ; GCN-NOHSA-SI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -590,7 +590,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -608,7 +608,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 @@ -742,7 +742,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; ; GFX12-LABEL: constant_load_v16i16_align2: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0xf @@ -778,7 +778,7 @@ entry: define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -795,7 +795,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -808,7 +808,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -837,7 +837,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -855,7 +855,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -872,7 +872,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -885,7 +885,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: 
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -915,7 +915,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -933,7 +933,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -950,7 +950,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -963,7 +963,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -992,7 +992,7 @@ define amdgpu_kernel void 
@constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1010,7 +1010,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1027,7 +1027,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1040,7 +1040,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i16 s2, 
s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1103,7 +1103,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) 
#0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1190,7 +1190,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1205,7 +1205,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1240,7 +1240,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1281,7 +1281,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1361,7 +1361,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1380,7 +1380,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; 
%bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 @@ -1440,7 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v3i16_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1465,7 +1465,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1484,7 +1484,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1503,7 +1503,7 @@ define 
amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1545,7 +1545,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1611,7 +1611,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1655,7 +1655,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,7 +1743,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1807,7 +1807,7 @@ define 
amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1842,7 +1842,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -1871,7 +1871,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1903,7 +1903,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,7 +1969,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2001,7 +2001,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2050,7 +2050,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2265,7 +2265,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr 
addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -2314,7 +2314,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2487,7 +2487,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2533,7 +2533,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; 
GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -2622,7 +2622,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +2732,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2937,7 +2937,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3013,7 +3013,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: 
s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -3102,7 +3102,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3212,7 +3212,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -3427,7 +3427,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3503,7 +3503,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void 
@constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3888,7 +3888,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4290,7 +4290,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x0 @@ -4426,7 +4426,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 @@ -4595,7 +4595,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: 
s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4811,7 +4811,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 @@ -5229,7 +5229,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v64i16_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x40 @@ -5365,7 +5365,7 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5383,7 +5383,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5397,7 +5397,7 
@@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5453,7 +5453,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5471,7 +5471,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5485,7 +5485,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5519,7 +5519,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i16_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5558,7 +5558,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5572,7 +5572,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5604,7 +5604,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: ; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] @@ -5623,7 +5623,7 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5641,7 +5641,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5655,7 +5655,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 @@ -5689,7 +5689,7 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -5710,7 +5710,7 @@ define amdgpu_kernel void 
@constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5727,7 +5727,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5744,7 +5744,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -5782,7 +5782,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5823,7 +5823,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5841,7 +5841,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -5882,7 +5882,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i16_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5905,7 +5905,7 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -5928,7 +5928,7 
@@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5954,7 +5954,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6009,7 +6009,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6037,7 +6037,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6064,7 +6064,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6094,7 +6094,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6156,7 +6156,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6187,7 +6187,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6222,7 +6222,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6266,7 +6266,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; 
GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6357,7 +6357,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6394,7 +6394,7 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6492,7 +6492,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6598,7 +6598,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6642,7 +6642,7 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 @@ -6701,7 +6701,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6781,7 +6781,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -6945,7 +6945,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: 
constant_zextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7003,7 +7003,7 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -7081,7 +7081,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7187,7 +7187,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7381,7 +7381,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 
s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7452,7 +7452,7 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 @@ -7559,7 +7559,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7711,7 +7711,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8026,7 +8026,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8124,7 +8124,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -8272,7 +8272,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8476,7 +8476,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -8854,7 +8854,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index f1a6bccc559f04..b0d8f72c22ba7a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -22,7 +22,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-NOHSA-LABEL: constant_load_i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-HSA-LABEL: constant_load_i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -73,7 +73,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -91,7 +91,7 @@ entry: define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -104,7 +104,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v2i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -146,7 +146,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v2i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -158,7 +158,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -177,7 +177,7 @@ entry: define amdgpu_kernel void 
@constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -193,7 +193,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -207,7 +207,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -242,7 +242,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v3i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -255,7 +255,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -274,7 
+274,7 @@ entry: define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -289,7 +289,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -304,7 +304,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v4i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -335,7 +335,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v4i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -349,7 +349,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 
0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -369,7 +369,7 @@ entry: define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -390,7 +390,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -414,7 +414,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v8i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 @@ -458,7 +458,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v8i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -477,7 +477,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -501,7 +501,7 @@ entry: define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v9i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -526,7 +526,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v9i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s12, s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -614,7 +614,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-HSA-LABEL: constant_load_v9i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s12, s[10:11], 0x20 @@ -636,7 +636,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v9i32: ; GFX12: ; %bb.0: ; %entry -; 
GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s12, s[10:11], 0x20 @@ -663,7 +663,7 @@ entry: define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v10i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -689,7 +689,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -721,7 +721,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v10i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -780,7 +780,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v10i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x20 @@ -803,7 +803,7 @@ define amdgpu_kernel void 
@constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v10i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 @@ -831,7 +831,7 @@ entry: define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v11i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -860,7 +860,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -893,7 +893,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -958,7 +958,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v11i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -982,7 +982,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v11i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20 @@ -1010,7 +1010,7 @@ entry: define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v12i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1038,7 +1038,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v12i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v12i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: 
s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 @@ -1158,7 +1158,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v12i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 @@ -1187,7 +1187,7 @@ entry: define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -1220,7 +1220,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v16i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_add_u32 s18, s16, 48 @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, 
ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v16i32: ; GFX9-HSA: ; %bb.0: ; %entry -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1395,7 +1395,7 @@ entry: define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1408,7 +1408,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1421,7 +1421,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1451,7 +1451,7 @@ define 
amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1462,7 +1462,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_zextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1494,7 +1494,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1508,7 +1508,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,7 +1539,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX9-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1552,7 +1552,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX12-LABEL: constant_sextload_i32_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1586,7 +1586,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1599,7 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1629,7 +1629,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1640,7 +1640,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1658,7 +1658,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1686,7 +1686,7 @@ define amdgpu_kernel void 
@constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1730,7 +1730,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1751,7 +1751,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1766,7 +1766,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1781,7 +1781,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,7 +1816,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,7 +1829,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1866,7 +1866,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1884,7 +1884,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -1940,7 +1940,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1963,7 +1963,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1982,7 +1982,7 @@ define 
amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2004,7 +2004,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2053,7 +2053,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2070,7 +2070,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2093,7 +2093,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2117,7 +2117,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2145,7 +2145,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2203,7 +2203,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX6-NOHSA: 
; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2281,7 +2281,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2399,7 +2399,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2424,7 +2424,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2453,7 +2453,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void 
@constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -2491,7 +2491,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2543,7 +2543,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2644,7 +2644,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX9-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX12-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 
s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2721,7 +2721,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -2788,7 +2788,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2888,7 +2888,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3075,7 +3075,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: 
constant_sextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3201,7 +3201,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 @@ -3244,7 +3244,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3462,7 +3462,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 
; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3503,7 +3503,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3544,7 +3544,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -3680,7 +3680,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3870,7 +3870,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4231,7 +4231,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: 
constant_sextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -4355,7 +4355,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[0:15], s[38:39], 0x0 @@ -4459,7 +4459,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -4537,7 +4537,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4672,7 +4672,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4957,7 +4957,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX9-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5031,7 +5031,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 @@ -5098,7 +5098,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 @@ -5158,7 +5158,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5241,7 +5241,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], 
s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5375,7 +5375,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX9-HSA-LABEL: constant_load_v32i32: ; GFX9-HSA: ; %bb.0: -; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX9-HSA-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 @@ -5426,7 +5426,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 46c7c2b08cd64b..66c73fda38743f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -21,7 +21,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, 
s0 @@ -34,7 +34,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: constant_load_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -63,7 +63,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; ; GFX12-LABEL: constant_load_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v2i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -96,7 +96,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -111,7 +111,7 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v2i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -142,7 +142,7 @@ define amdgpu_kernel 
void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v2i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -162,7 +162,7 @@ entry: define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v3i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -182,7 +182,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -205,7 +205,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v3i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -253,7 +253,7 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v3i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10 @@ -278,7 +278,7 @@ entry: define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v4i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 @@ -299,7 +299,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -323,7 +323,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v4i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NEXT: s_add_u32 s10, s8, 16 @@ -372,7 +372,7 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v4i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -396,7 +396,7 @@ entry: define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v8i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NEXT: s_mov_b32 s19, 0xf000 @@ -429,7 +429,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: 
; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -471,7 +471,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-LABEL: constant_load_v8i64: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NEXT: s_add_u32 s18, s16, 48 @@ -558,7 +558,7 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v8i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -588,7 +588,7 @@ entry: define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-LABEL: constant_load_v16i64: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 ; GFX6-NEXT: s_mov_b32 s39, 0xf000 @@ -648,7 +648,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -731,7 +731,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: constant_load_v16i64: ; GFX8: ; %bb.0: ; %entry -; 
GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -899,7 +899,7 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX12-LABEL: constant_load_v16i64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 67a376b8c0f3c5..9000cee7ef9df0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-NOHSA-LABEL: constant_load_i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -78,7 +78,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: constant_load_i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -96,7 +96,7 @@ entry: define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v2i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -113,7 +113,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -126,7 +126,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v2i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -165,7 +165,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v2i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 
v1, v0, s[2:3] @@ -183,7 +183,7 @@ entry: define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v3i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -198,7 +198,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -217,7 +217,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v3i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -278,7 +278,7 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v3i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -298,7 +298,7 @@ entry: define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v4i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: 
s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -310,7 +310,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -322,7 +322,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v4i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -350,7 +350,7 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v4i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -368,7 +368,7 @@ entry: define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v8i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -381,7 +381,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -394,7 +394,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX8-NOHSA-LABEL: constant_load_v8i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -423,7 +423,7 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: constant_load_v8i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -442,7 +442,7 @@ entry: define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_v16i8: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -457,7 +457,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -472,7 +472,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX8-NOHSA-LABEL: constant_load_v16i8: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -503,7 +503,7 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; ; GFX12-LABEL: constant_load_v16i8: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -523,7 +523,7 @@ entry: define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -540,7 +540,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -553,7 +553,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -582,7 +582,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -617,7 +617,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -660,7 +660,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -678,7 +678,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) 
%in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -695,7 +695,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -708,7 +708,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -737,7 +737,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -755,7 +755,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ 
-772,7 +772,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -785,7 +785,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -815,7 +815,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -834,7 +834,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -868,7 +868,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -911,7 +911,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -934,7 +934,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -953,7 +953,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -968,7 +968,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] @@ -1034,7 +1034,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1070,7 +1070,7 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1112,7 +1112,7 @@ define amdgpu_kernel void 
@constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1137,7 +1137,7 @@ entry: define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1156,7 +1156,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1173,7 +1173,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 @@ -1242,7 +1242,7 @@ entry: define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1261,7 +1261,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1280,7 +1280,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1322,7 +1322,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1367,7 +1367,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1386,7 +1386,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1457,7 +1457,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1486,7 +1486,7 @@ define amdgpu_kernel void 
@constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1518,7 +1518,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1616,7 +1616,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1677,7 +1677,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -1782,7 +1782,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -1831,7 +1831,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: 
constant_zextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,7 +1999,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2046,7 +2046,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2153,7 +2153,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2415,7 +2415,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2525,7 +2525,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2724,7 +2724,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -2802,7 +2802,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -2891,7 +2891,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3001,7 +3001,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3231,7 +3231,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -3315,7 +3315,7 @@ define 
amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3483,7 +3483,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3699,7 +3699,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4086,7 +4086,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -4222,7 +4222,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: 
constant_sextload_v64i8_to_v64i32: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4390,7 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4604,7 +4604,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5047,7 +5047,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[16:19], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5195,7 +5195,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5213,7 +5213,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5227,7 +5227,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -5259,7 +5259,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v1, s[2:3] @@ -5279,7 +5279,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5297,7 +5297,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5311,7 +5311,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5345,7 +5345,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5366,7 +5366,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5384,7 +5384,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5398,7 +5398,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64: ; 
GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5429,7 +5429,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5448,7 +5448,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5466,7 +5466,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5480,7 +5480,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void 
@constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v0, v2, s[2:3] @@ -5535,7 +5535,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5556,7 +5556,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5573,7 +5573,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5620,7 +5620,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
global_load_u16 v0, v1, s[2:3] @@ -5642,7 +5642,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -5664,7 +5664,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5682,7 +5682,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5731,7 +5731,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v0, v4, s[2:3] @@ -5756,7 +5756,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; 
GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5779,7 +5779,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5805,7 +5805,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5863,7 +5863,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -5892,7 +5892,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -5920,7 +5920,7 @@ define 
amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5951,7 +5951,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,7 +6014,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6046,7 +6046,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6081,7 +6081,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6125,7 +6125,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6220,7 +6220,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6260,7 +6260,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6305,7 +6305,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6361,7 +6361,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; 
%bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -6472,7 +6472,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s5, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -6518,7 +6518,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6577,7 +6577,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6657,7 +6657,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6826,7 +6826,7 @@ define 
amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -6887,7 +6887,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -6967,7 +6967,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7074,7 +7074,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7275,7 +7275,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -7347,7 +7347,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -7454,7 +7454,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7606,7 +7606,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7934,7 +7934,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8039,7 +8039,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8198,7 +8198,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8406,7 +8406,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8793,7 +8793,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -8932,7 +8932,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -8949,7 +8949,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8962,7 +8962,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9000,7 +9000,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_zextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9018,7 +9018,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9035,7 +9035,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: 
-; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9048,7 +9048,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9088,7 +9088,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; ; GFX12-LABEL: constant_sextload_i8_to_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9106,7 +9106,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9123,7 +9123,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9136,7 +9136,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr 
addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9174,7 +9174,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v1, v0, s[2:3] @@ -9192,7 +9192,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9209,7 +9209,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9222,7 +9222,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9262,7 +9262,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_i8 v1, v0, s[2:3] @@ -9280,7 +9280,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9300,7 +9300,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9316,7 +9316,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9356,7 +9356,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9379,7 +9379,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s6, -1 ; GFX6-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -9401,7 +9401,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9419,7 +9419,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -9469,7 +9469,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr 
addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9512,7 +9512,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9532,7 +9532,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9598,7 +9598,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -9627,7 +9627,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9650,7 +9650,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9673,7 +9673,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9749,7 +9749,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9777,7 +9777,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -9806,7 +9806,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; 
GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9835,7 +9835,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9940,7 +9940,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -9976,7 +9976,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10011,7 +10011,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10046,7 +10046,7 
@@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10171,7 +10171,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10206,7 +10206,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10254,7 +10254,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10306,7 +10306,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10501,7 +10501,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10556,7 +10556,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 @@ -10617,7 +10617,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10681,7 +10681,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10912,7 +10912,7 @@ define amdgpu_kernel void 
@constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -10968,7 +10968,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11054,7 +11054,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11152,7 +11152,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -11616,7 +11616,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 @@ -11729,7 +11729,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11851,7 +11851,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) @@ -12297,7 +12297,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 
69c7c5cf81ab55..4dd7a03659f72f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -40,7 +40,7 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-NOHSA-VI-LABEL: global_load_i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -115,7 +115,7 @@ entry: define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v2i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -132,7 +132,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -145,7 +145,7 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v2i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -200,7 +200,7 @@ entry: define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v3i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -218,7 +218,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v3i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -332,7 +332,7 @@ entry: define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v4i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; 
GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -349,7 +349,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -362,7 +362,7 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v4i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -417,7 +417,7 @@ entry: define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v8i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -434,7 +434,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -447,7 +447,7 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; ; GCN-NOHSA-VI-LABEL: global_load_v8i16: ; GCN-NOHSA-VI: ; 
%bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -502,7 +502,7 @@ entry: define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_load_v16i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -522,7 +522,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -546,7 +546,7 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -622,7 +622,7 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 @@ -672,7 +672,7 @@ define amdgpu_kernel void 
@global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -696,7 +696,7 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; ; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -804,7 +804,7 @@ entry: define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -821,7 +821,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,7 +834,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 
s10, s6 @@ -889,7 +889,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -906,7 +906,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -919,7 +919,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -977,7 +977,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -994,7 +994,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; 
GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1007,7 +1007,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1062,7 +1062,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1079,7 +1079,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1150,7 
+1150,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1184,7 +1184,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1268,7 +1268,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; 
GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1283,7 +1283,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1348,7 +1348,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1369,7 +1369,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1385,7 +1385,7 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 
s10, s6 @@ -1458,7 +1458,7 @@ entry: define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1479,7 +1479,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1495,7 +1495,7 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1575,7 +1575,7 @@ entry: define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1596,7 +1596,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1613,7 +1613,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1689,7 +1689,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1711,7 +1711,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1729,7 +1729,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, define 
amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1833,7 +1833,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1859,7 +1859,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -1982,7 +1982,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2008,7 +2008,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -2108,7 +2108,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2146,7 +2146,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2344,7 +2344,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr 
addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -2591,7 +2591,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -2653,7 +2653,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], 
s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2751,7 +2751,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3002,7 +3002,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3064,7 +3064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3162,7 +3162,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -3450,9 +3450,9 @@ define amdgpu_kernel void 
@global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -3583,7 +3583,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3776,14 +3776,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -4265,13 +4265,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: 
global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN-NOHSA-SI-NEXT: s_add_u32 s8, s8, s3 +; GCN-NOHSA-SI-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -4294,10 +4294,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 @@ -4377,17 
+4377,17 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4580,14 +4580,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-NOHSA-VI-NEXT: s_add_u32 
s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 @@ -5134,7 +5134,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5152,7 +5152,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5166,7 +5166,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5231,7 +5231,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 
s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5249,7 +5249,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5263,7 +5263,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5326,7 +5326,7 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5344,7 +5344,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5358,7 +5358,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5418,7 +5418,7 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5436,7 +5436,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5450,7 +5450,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5513,7 +5513,7 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 
s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5534,7 +5534,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5551,7 +5551,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5621,7 +5621,7 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5643,7 +5643,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5661,7 +5661,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-NOHSA-VI: ; %bb.0: -; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5735,7 +5735,7 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5761,7 +5761,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5787,7 +5787,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -5879,7 +5879,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 
s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 @@ -5906,7 +5906,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5933,7 +5933,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6 @@ -6030,7 +6030,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6066,7 +6066,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 @@ -6110,7 +6110,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; 
GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6248,7 +6248,7 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6285,7 +6285,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6330,7 +6330,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6477,7 +6477,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: 
s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6535,7 +6535,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 @@ -6621,7 +6621,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -6855,7 +6855,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -6915,7 +6915,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7003,7 +7003,7 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: ; 
GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7255,9 +7255,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s9 +; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3 ; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 @@ -7383,7 +7383,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -7538,7 +7538,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 @@ -7974,7 +7974,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 @@ -8085,7 +8085,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8259,7 +8259,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 4d7f1a9663c3dc..0e1f470ca72aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -39,7 +39,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCNX3-NOHSA-LABEL: global_load_i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; ; GCN-HSA-LABEL: global_load_i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3] @@ -88,7 +88,7 @@ entry: define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v2i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -105,7 +105,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -118,7 +118,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v2i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: 
s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -151,7 +151,7 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v2i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -167,7 +167,7 @@ entry: define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v3i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -185,7 +185,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +198,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v3i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -236,7 +236,7 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v3i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3] @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v4i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -269,7 +269,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v4i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -315,7 +315,7 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v4i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -331,7 +331,7 @@ entry: define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v8i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -351,7 +351,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -375,7 +375,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v8i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -415,7 +415,7 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v8i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -434,7 +434,7 @@ entry: define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v9i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -457,7 +457,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -492,7 +492,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCNX3-NOHSA-LABEL: global_load_v9i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -543,7 +543,7 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; ; GCN-HSA-LABEL: global_load_v9i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -565,7 +565,7 @@ entry: define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v10i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -588,7 +588,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v10i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -672,7 +672,7 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v10i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3] @@ -694,7 +694,7 @@ entry: define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v11i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -718,7 +718,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -753,7 +753,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v11i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -807,7 +807,7 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v11i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -830,7 +830,7 @@ entry: define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v12i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -853,7 +853,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -888,7 +888,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v12i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -938,7 +938,7 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v12i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] @@ -960,7 +960,7 @@ entry: define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v16i32: ; SI-NOHSA: ; %bb.0: ; %entry -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -986,7 +986,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1032,7 +1032,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v16i32: ; GCNX3-NOHSA: ; %bb.0: ; %entry -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1090,7 +1090,7 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v16i32: ; GCN-HSA: ; %bb.0: ; %entry -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32 @@ -1115,7 +1115,7 @@ entry: define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1133,7 +1133,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1182,7 +1182,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_zextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1198,7 +1198,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_i32_to_i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1230,7 +1230,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; 
GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; ; GCN-HSA-LABEL: global_sextload_i32_to_i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1282,7 +1282,7 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1300,7 +1300,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1314,7 +1314,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 
s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] @@ -1365,7 +1365,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1383,7 +1383,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1397,7 +1397,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1432,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1470,7 +1470,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1487,7 +1487,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1528,7 +1528,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] @@ -1547,7 +1547,7 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void 
@global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1567,7 +1567,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1583,7 +1583,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1669,7 +1669,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1746,7 +1746,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1769,7 +1769,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1795,7 +1795,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1821,7 +1821,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1877,7 +1877,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3] @@ -1902,7 +1902,7 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1981,7 +1981,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr 
addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2059,7 +2059,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2091,7 +2091,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -2129,7 +2129,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2179,7 +2179,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; 
GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; ; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3] @@ -2303,7 +2303,7 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2365,7 +2365,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2463,7 +2463,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2613,7 +2613,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; 
GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2726,7 +2726,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -2941,7 +2941,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2995,9 +2995,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) 
%ou ; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NOHSA-NEXT: s_mov_b32 s14, -1 ; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NOHSA-NEXT: s_add_u32 s12, s12, s9 +; SI-NOHSA-NEXT: s_add_u32 s12, s12, s3 ; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3117,7 +3117,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -3585,12 +3585,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s13 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 +; GCN-GFX900-HSA-NEXT: s_add_u32 s8, s8, s7 +; 
GCN-GFX900-HSA-NEXT: s_addc_u32 s9, s9, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 @@ -3616,11 +3616,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 @@ -3663,11 +3663,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, 
s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 @@ -3704,7 +3704,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX908-HSA: ; %bb.0: -; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 @@ -3820,7 +3820,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 ; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, 
s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4073,7 +4073,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 @@ -4312,7 +4312,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4398,7 +4398,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_v32i32: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s6, -1 ; SI-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4432,7 +4432,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: -; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4524,7 +4524,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCNX3-NOHSA-LABEL: global_load_v32i32: ; GCNX3-NOHSA: ; %bb.0: -; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 ; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 ; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 @@ -4612,7 +4612,7 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; ; GCN-HSA-LABEL: global_load_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll index a71418f3dbf5ba..26b559ae6fa9a9 100644 --- a/llvm/test/CodeGen/AMDGPU/local-64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-64.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i32, ptr addrspace(3) %in, i32 7 %val = load i32, ptr addrspace(3) %gep, align 4 store i32 %val, ptr addrspace(1) %out, align 4 @@ -22,7 +22,7 @@ define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; GCN: buffer_store_dword [[REG]], -define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load i32, ptr addrspace(3) %in, align 4 store i32 %val, ptr addrspace(1) %out, align 4 ret void @@ -35,7 +35,7 @@ define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { 
+define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65535 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -56,7 +56,7 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, p ; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; GCN: buffer_store_byte [[REG]], -define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i8, ptr addrspace(3) %in, i32 65536 %val = load i8, ptr addrspace(3) %gep, align 4 store i8 %val, ptr addrspace(1) %out, align 4 @@ -70,7 +70,7 @@ define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %o ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr i64, ptr addrspace(3) %in, i32 7 %val = load i64, ptr addrspace(3) %gep, align 8 store i64 %val, ptr addrspace(1) %out, align 8 @@ -83,7 +83,7 @@ define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load i64, ptr addrspace(3) %in, align 8 store i64 %val, ptr addrspace(1) %out, align 8 ret void @@ -96,7 +96,7 @@ define amdgpu_kernel void 
@local_i64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %gep = getelementptr double, ptr addrspace(3) %in, i32 7 %val = load double, ptr addrspace(3) %gep, align 8 store double %val, ptr addrspace(1) %out, align 8 @@ -109,7 +109,7 @@ define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN: buffer_store_dwordx2 [[REG]], -define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { +define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind { %val = load double, ptr addrspace(3) %in, align 8 store double %val, ptr addrspace(1) %out, align 8 ret void @@ -121,7 +121,7 @@ define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr ad ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr i64, ptr addrspace(3) %out, i32 7 store i64 5678, ptr addrspace(3) %gep, align 8 ret void @@ -133,7 +133,7 @@ define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) #0 { ; GCN-NOT: add ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) nounwind { store i64 1234, ptr addrspace(3) %out, align 8 ret void } @@ -144,7 +144,7 @@ define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) #0 { ; GCN-NOT: add ; GCN: 
ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr double, ptr addrspace(3) %out, i32 7 store double 16.0, ptr addrspace(3) %gep, align 8 ret void @@ -155,7 +155,7 @@ define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) #0 { ; GFX9-NOT: m0 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) nounwind { store double 20.0, ptr addrspace(3) %out, align 8 ret void } @@ -168,7 +168,7 @@ define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) #0 { ; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr <2 x i64>, ptr addrspace(3) %out, i32 7 store <2 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -184,7 +184,7 @@ define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) #0 { ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) nounwind { store <2 x i64> , ptr addrspace(3) %out, align 16 ret void } @@ -201,7 +201,7 @@ define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) #0 ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240{{$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void 
@local_v4i64_store(ptr addrspace(3) %out) nounwind { %gep = getelementptr <4 x i64>, ptr addrspace(3) %out, i32 7 store <4 x i64> , ptr addrspace(3) %gep, align 16 ret void @@ -219,9 +219,7 @@ define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) #0 { ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}} ; GCN: s_endpgm -define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) #0 { +define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) nounwind { store <4 x i64> , ptr addrspace(3) %out, align 16 ret void } - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 65614a17fc0114..87c0c6cfc790ff 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7007,21 +7007,21 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX12-NEXT: s_add_co_i32 s3, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB28_2 ; GFX12-NEXT: ; 
%bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; GFX12-NEXT: s_lshl_b32 s5, s3, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE @@ -7029,7 +7029,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: v_readfirstlane_b32 s5, v1 @@ -7038,12 +7038,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB28_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX12-NEXT: s_lshl_b32 s0, s1, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: s_lshl_b32 s2, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7051,8 +7051,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: .LBB28_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_brev_b32 s0, 1 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_brev_b32 s2, 1 
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7061,33 +7061,33 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB28_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: v_writelane_b32 v0, s2, s5 +; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 -; GFX12-NEXT: s_add_f32 s0, s0, s6 +; GFX12-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12-NEXT: s_add_f32 s2, s2, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12-NEXT: s_cbranch_execz .LBB28_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: global_wb scope:SCOPE_SE ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: .LBB28_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7100,10 +7100,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX940-LABEL: local_ds_fadd: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7111,9 +7111,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB28_2 ; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7124,15 +7124,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB28_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: 
s_bcnt1_i32_b64 s2, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: s_lshl_b32 s2, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_4: @@ -7141,20 +7141,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB28_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ -7162,16 +7162,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB28_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; 
GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: .LBB28_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NEXT: s_nop 0 @@ -7184,28 +7184,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX11-LABEL: local_ds_fadd: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_add_i32 s3, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 @@ -7214,12 +7214,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz 
.LBB28_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv @@ -7227,7 +7227,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7236,25 +7236,25 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: .LBB28_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: v_readlane_b32 s6, v2, s3 +; GFX11-NEXT: s_lshl_b32 s7, 1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: 
v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB28_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 @@ -7262,8 +7262,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB28_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 @@ -7276,19 +7276,19 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX10-LABEL: local_ds_fadd: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_add_i32 s3, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB28_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 
s5, s1, 3 +; GFX10-NEXT: s_lshl_b32 s5, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7296,18 +7296,18 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s2 ; GFX10-NEXT: s_cbranch_execz .LBB28_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ds_add_f32 v2, v1 @@ -7318,28 +7318,28 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB28_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s1, s0 +; GFX10-NEXT: s_ff1_i32_b32 s3, s2 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: 
s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_readlane_b32 s6, v2, s3 +; GFX10-NEXT: s_lshl_b32 s7, 1, s3 +; GFX10-NEXT: s_andn2_b32 s2, s2, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s3 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB28_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -7349,9 +7349,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: .LBB28_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_f32_e32 v0, s2, v0 @@ -7362,10 +7361,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX90A-LABEL: local_ds_fadd: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -7373,9 +7372,9 @@ define 
amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB28_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7386,15 +7385,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB28_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: s_lshl_b32 s2, s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_4: @@ -7403,20 +7402,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b64 s[2:3], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd @@ -7424,16 +7423,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB28_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB28_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 @@ -7445,10 +7444,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX908-LABEL: local_ds_fadd: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; 
GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -7456,9 +7455,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB28_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7469,15 +7468,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB28_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: s_lshl_b32 s2, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_4: @@ -7486,20 +7485,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_mov_b64 s[2:3], exec ; GFX908-NEXT: 
v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB28_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, s5 -; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -7507,16 +7506,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB28_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: .LBB28_8: -; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 ; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, s2 @@ -7528,10 +7527,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX8-LABEL: local_ds_fadd: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; 
GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -7540,9 +7539,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB28_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7553,15 +7552,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB28_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_4: @@ -7570,20 +7569,20 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; 
GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB28_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB28_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -7591,8 +7590,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB28_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -7600,8 +7599,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB28_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -7614,10 +7613,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX7-LABEL: local_ds_fadd: ; GFX7: ; %bb.0: -; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -7629,8 +7628,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7640,8 +7639,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_2 ; GFX7-NEXT: ; %bb.3: ; %Flow22 @@ -7652,15 +7651,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB28_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 -; GFX7-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshl_b32 s2, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7669,8 +7668,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_6 @@ -7680,16 +7679,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: s_mov_b64 s[2:3], exec ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 ; GFX7-NEXT: .LBB28_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX7-NEXT: v_readfirstlane_b32 s8, v1 ; GFX7-NEXT: v_readlane_b32 s9, v2, s5 ; GFX7-NEXT: s_mov_b32 m0, s5 @@ -7702,8 +7701,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, 
exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7-NEXT: ; implicit-def: $vgpr2 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB28_13 ; GFX7-NEXT: ; %bb.10: ; GFX7-NEXT: v_mov_b32_e32 v3, s4 @@ -7717,15 +7716,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB28_11 ; GFX7-NEXT: ; %bb.12: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: .LBB28_13: ; %Flow19 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 @@ -7738,10 +7737,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; ; GFX6-LABEL: local_ds_fadd: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -7753,8 +7752,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 
v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_2: ; %atomicrmw.start @@ -7764,8 +7763,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_2 ; GFX6-NEXT: ; %bb.3: ; %Flow20 @@ -7776,15 +7775,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB28_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshl_b32 s2, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB28_6: ; %atomicrmw.start2 @@ -7793,8 +7792,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_6 @@ -7804,16 +7803,16 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 ; GFX6-NEXT: .LBB28_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: v_readlane_b32 s9, v2, s5 ; GFX6-NEXT: s_mov_b32 m0, s5 @@ -7826,8 +7825,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX6-NEXT: ; implicit-def: $vgpr2 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB28_13 ; GFX6-NEXT: ; %bb.10: ; GFX6-NEXT: v_mov_b32_e32 v3, s4 @@ -7841,15 +7840,15 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 ; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 
s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB28_11 ; GFX6-NEXT: ; %bb.12: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: .LBB28_13: ; %Flow17 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 @@ -7874,26 +7873,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { ; GFX12-LABEL: local_ds_fadd_one_as: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX12-NEXT: s_mov_b32 s6, exec_lo ; GFX12-NEXT: ; implicit-def: $vgpr1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s1, s5, 4 -; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX12-NEXT: s_add_co_i32 s3, s5, 4 +; GFX12-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX12-NEXT: s_cbranch_execz .LBB29_2 ; GFX12-NEXT: ; %bb.1: ; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX12-NEXT: s_lshl_b32 s5, s1, 3 +; GFX12-NEXT: s_lshl_b32 s5, s3, 3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX12-NEXT: .LBB29_2: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s7, exec_lo ; GFX12-NEXT: s_wait_dscnt 0x0 @@ -7903,18 +7902,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX12-NEXT: s_cbranch_execz .LBB29_4 ; GFX12-NEXT: ; %bb.3: -; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX12-NEXT: s_lshl_b32 s0, s1, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: s_lshl_b32 s2, s3, 4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX12-NEXT: ds_add_f32 v2, v1 ; GFX12-NEXT: .LBB29_4: ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_brev_b32 s0, 1 +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_brev_b32 s2, 1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX12-NEXT: v_add_f32_e32 v0, s5, v0 @@ -7923,30 +7922,30 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: ; implicit-def: $vgpr0 ; GFX12-NEXT: .LBB29_5: ; %ComputeLoop ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_ctz_i32_b32 s5, s1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_readlane_b32 s6, v1, s5 ; GFX12-NEXT: s_lshl_b32 s7, 1, s5 -; GFX12-NEXT: v_writelane_b32 v0, s0, s5 -; GFX12-NEXT: s_and_not1_b32 s1, s1, s7 +; GFX12-NEXT: v_writelane_b32 v0, s2, s5 +; GFX12-NEXT: s_and_not1_b32 s3, s3, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_lg_u32 s1, 0 -; GFX12-NEXT: 
s_add_f32 s0, s0, s6 +; GFX12-NEXT: s_cmp_lg_u32 s3, 0 +; GFX12-NEXT: s_add_f32 s2, s2, s6 ; GFX12-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX12-NEXT: ; %bb.6: ; %ComputeEnd ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX12-NEXT: ; implicit-def: $vgpr1 -; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX12-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX12-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: -; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 ; GFX12-NEXT: .LBB29_8: -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7960,10 +7959,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX940-LABEL: local_ds_fadd_one_as: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s5, s5, 4 @@ -7971,9 +7970,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX940-NEXT: s_cbranch_execz .LBB29_2 ; GFX940-NEXT: ; %bb.1: 
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX940-NEXT: s_lshl_b32 s8, s5, 3 -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX940-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -7984,15 +7983,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_readfirstlane_b32 s10, v1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX940-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX940-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB29_4 ; GFX940-NEXT: ; %bb.3: -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX940-NEXT: s_lshl_b32 s0, s5, 4 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: s_lshl_b32 s2, s5, 4 ; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f32 v2, v1 ; GFX940-NEXT: .LBB29_4: ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8000,20 +7999,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX940-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX940-NEXT: v_mov_b32_e32 v1, s10 -; GFX940-NEXT: s_mov_b64 s[0:1], exec +; GFX940-NEXT: s_mov_b64 s[2:3], exec ; GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX940-NEXT: ; implicit-def: $vgpr0 ; GFX940-NEXT: .LBB29_5: ; %ComputeLoop ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX940-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX940-NEXT: v_readfirstlane_b32 s8, v1 ; 
GFX940-NEXT: v_readlane_b32 s9, v2, s5 ; GFX940-NEXT: s_mov_b32 m0, s5 -; GFX940-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX940-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX940-NEXT: v_writelane_b32 v0, s8, m0 -; GFX940-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX940-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX940-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX940-NEXT: ; %bb.6: ; %ComputeEnd @@ -8021,15 +8020,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX940-NEXT: ; implicit-def: $vgpr2 -; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX940-NEXT: s_cbranch_execz .LBB29_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX940-NEXT: .LBB29_8: -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_readfirstlane_b32 s2, v2 ; GFX940-NEXT: v_mov_b32_e32 v1, 0 @@ -8042,26 +8041,26 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX11-LABEL: local_ds_fadd_one_as: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x8 ; GFX11-NEXT: s_mov_b32 s6, exec_lo ; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s5, 4 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_add_i32 s3, s5, 4 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo 
; GFX11-NEXT: s_cbranch_execz .LBB29_2 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX11-NEXT: s_lshl_b32 s5, s1, 3 +; GFX11-NEXT: s_lshl_b32 s5, s3, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX11-NEXT: .LBB29_2: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_mov_b32 s7, exec_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -8071,18 +8070,18 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX11-NEXT: s_cbranch_execz .LBB29_4 ; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: s_bcnt1_i32_b32 s0, s7 +; GFX11-NEXT: s_bcnt1_i32_b32 s2, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, 4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, 4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mul_f32 v1, 0x42280000, v1 ; GFX11-NEXT: ds_add_f32 v2, v1 ; GFX11-NEXT: .LBB29_4: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX11-NEXT: v_add_f32_e32 v0, s5, v0 @@ -8091,32 +8090,32 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ; implicit-def: $vgpr0 ; 
GFX11-NEXT: .LBB29_5: ; %ComputeLoop ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_ctz_i32_b32 s1, s0 +; GFX11-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readlane_b32 s6, v2, s1 -; GFX11-NEXT: s_lshl_b32 s7, 1, s1 +; GFX11-NEXT: v_readlane_b32 s6, v2, s3 +; GFX11-NEXT: s_lshl_b32 s7, 1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s7 -; GFX11-NEXT: v_writelane_b32 v0, s5, s1 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s7 +; GFX11-NEXT: v_writelane_b32 v0, s5, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX11-NEXT: ; %bb.6: ; %ComputeEnd ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: ; implicit-def: $vgpr2 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX11-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execz .LBB29_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX11-NEXT: .LBB29_8: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8129,37 +8128,37 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX10-LABEL: local_ds_fadd_one_as: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s5, 4 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX10-NEXT: s_add_i32 s3, s5, 4 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB29_2 ; GFX10-NEXT: ; %bb.1: ; GFX10-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s1, 3 +; GFX10-NEXT: s_lshl_b32 s5, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_rtn_f32 v1, v2, v1 ; GFX10-NEXT: .LBB29_2: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_mov_b32 s7, exec_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 -; GFX10-NEXT: s_and_saveexec_b32 s6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v2 +; GFX10-NEXT: s_and_saveexec_b32 s6, s2 ; GFX10-NEXT: s_cbranch_execz .LBB29_4 ; GFX10-NEXT: ; %bb.3: -; GFX10-NEXT: s_bcnt1_i32_b32 s0, s7 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, 4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_bcnt1_i32_b32 s2, s7 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s3, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX10-NEXT: ds_add_f32 v2, v1 ; GFX10-NEXT: .LBB29_4: @@ -8167,36 +8166,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: 
s_mov_b32 s2, exec_lo ; GFX10-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX10-NEXT: v_add_f32_e32 v0, s5, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s5, vcc_lo ; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: .LBB29_5: ; %ComputeLoop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_ff1_i32_b32 s1, s0 +; GFX10-NEXT: s_ff1_i32_b32 s3, s2 ; GFX10-NEXT: v_readfirstlane_b32 s5, v1 -; GFX10-NEXT: v_readlane_b32 s6, v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, 1, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s7 -; GFX10-NEXT: v_writelane_b32 v0, s5, s1 +; GFX10-NEXT: v_readlane_b32 s6, v2, s3 +; GFX10-NEXT: s_lshl_b32 s7, 1, s3 +; GFX10-NEXT: s_andn2_b32 s2, s2, s7 +; GFX10-NEXT: v_writelane_b32 v0, s5, s3 ; GFX10-NEXT: v_add_f32_e32 v1, s6, v1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX10-NEXT: ; %bb.6: ; %ComputeEnd ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX10-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execz .LBB29_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX10-NEXT: .LBB29_8: ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -8208,10 +8207,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX90A-LABEL: local_ds_fadd_one_as: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; 
GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_add_i32 s5, s5, 4 @@ -8219,9 +8218,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB29_2 ; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: s_lshl_b32 s8, s5, 3 -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8232,15 +8231,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB29_4 ; GFX90A-NEXT: ; %bb.3: -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX90A-NEXT: s_lshl_b32 s0, s5, 4 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX90A-NEXT: s_lshl_b32 s2, s5, 4 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f32 v2, v1 ; GFX90A-NEXT: .LBB29_4: ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8248,20 +8247,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mul_f32_e32 v0, 
0x42280000, v0 ; GFX90A-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, s10 -; GFX90A-NEXT: s_mov_b64 s[0:1], exec +; GFX90A-NEXT: s_mov_b64 s[2:3], exec ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX90A-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 ; GFX90A-NEXT: v_readlane_b32 s9, v2, s5 ; GFX90A-NEXT: s_mov_b32 m0, s5 -; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX90A-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd @@ -8269,15 +8268,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: ; implicit-def: $vgpr2 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execz .LBB29_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX90A-NEXT: .LBB29_8: -; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 ; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8289,10 +8288,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX908-LABEL: 
local_ds_fadd_one_as: ; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX908-NEXT: s_mov_b64 s[0:1], exec -; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX908-NEXT: s_mov_b64 s[2:3], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_add_i32 s5, s5, 4 @@ -8300,9 +8299,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX908-NEXT: s_cbranch_execz .LBB29_2 ; GFX908-NEXT: ; %bb.1: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX908-NEXT: s_lshl_b32 s8, s5, 3 -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX908-NEXT: v_mov_b32_e32 v2, s8 ; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8313,15 +8312,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_readfirstlane_b32 s10, v1 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX908-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX908-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB29_4 ; GFX908-NEXT: ; %bb.3: -; GFX908-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX908-NEXT: s_lshl_b32 s0, s5, 4 +; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX908-NEXT: s_lshl_b32 s2, s5, 4 ; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_mov_b32_e32 v2, s2 ; GFX908-NEXT: ds_add_f32 v2, v1 ; GFX908-NEXT: .LBB29_4: ; 
GFX908-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8329,20 +8328,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX908-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, s10 -; GFX908-NEXT: s_mov_b64 s[0:1], exec +; GFX908-NEXT: s_mov_b64 s[2:3], exec ; GFX908-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX908-NEXT: ; implicit-def: $vgpr0 ; GFX908-NEXT: .LBB29_5: ; %ComputeLoop ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX908-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX908-NEXT: v_readfirstlane_b32 s8, v1 ; GFX908-NEXT: v_readlane_b32 s9, v2, s5 ; GFX908-NEXT: s_mov_b32 m0, s5 -; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX908-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX908-NEXT: v_writelane_b32 v0, s8, m0 -; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX908-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX908-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX908-NEXT: ; %bb.6: ; %ComputeEnd @@ -8350,15 +8349,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX908-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX908-NEXT: s_cbranch_execz .LBB29_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX908-NEXT: .LBB29_8: -; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s2, v2 
; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8370,10 +8369,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX8-LABEL: local_ds_fadd_one_as: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 -; GFX8-NEXT: s_mov_b64 s[0:1], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s5, s5, 4 @@ -8382,9 +8381,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX8-NEXT: s_cbranch_execz .LBB29_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_lshl_b32 s8, s5, 3 -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: ds_add_rtn_f32 v1, v2, v1 @@ -8395,15 +8394,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_readfirstlane_b32 s10, v1 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s8, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s9, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB29_4 ; GFX8-NEXT: ; %bb.3: -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, s5, 4 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s5, 4 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX8-NEXT: ds_add_f32 v2, v1 ; GFX8-NEXT: .LBB29_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] @@ -8411,20 +8410,20 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: .LBB29_5: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX8-NEXT: v_readfirstlane_b32 s8, v1 ; GFX8-NEXT: v_readlane_b32 s9, v2, s5 ; GFX8-NEXT: s_mov_b32 m0, s5 -; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] ; GFX8-NEXT: v_writelane_b32 v0, s8, m0 -; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: v_add_f32_e32 v1, s9, v1 ; GFX8-NEXT: s_cbranch_scc1 .LBB29_5 ; GFX8-NEXT: ; %bb.6: ; %ComputeEnd @@ -8432,16 +8431,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8-NEXT: s_cbranch_execz .LBB29_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX8-NEXT: .LBB29_8: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: v_add_f32_e32 v0, s2, v0 @@ -8454,10 +8453,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: local_ds_fadd_one_as: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX7-NEXT: s_mov_b64 s[0:1], exec -; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s5, s5, 4 @@ -8469,8 +8468,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_lshl_b32 s8, s5, 3 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: ds_read_b32 v1, v2 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX7-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8480,8 +8479,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_2 ; GFX7-NEXT: ; %bb.3: ; %Flow22 @@ -8492,15 +8491,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_readfirstlane_b32 s10, v1 ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; 
GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB29_7 ; GFX7-NEXT: ; %bb.5: -; GFX7-NEXT: s_lshl_b32 s0, s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshl_b32 s2, s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: ds_read_b32 v3, v1 -; GFX7-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX7-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX7-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8509,8 +8508,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX7-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX7-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB29_6 @@ -8520,16 +8519,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX7-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_mov_b64 s[0:1], exec +; GFX7-NEXT: s_mov_b64 s[2:3], exec ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7-NEXT: ; implicit-def: $vgpr0 ; GFX7-NEXT: .LBB29_8: ; %ComputeLoop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX7-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX7-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX7-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX7-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; 
GFX7-NEXT: v_readfirstlane_b32 s8, v1 ; GFX7-NEXT: v_readlane_b32 s9, v2, s5 ; GFX7-NEXT: s_mov_b32 m0, s5 @@ -8542,8 +8541,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7-NEXT: ; implicit-def: $vgpr2 -; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7-NEXT: s_xor_b64 s[6:7], exec, s[2:3] ; GFX7-NEXT: s_cbranch_execz .LBB29_13 ; GFX7-NEXT: ; %bb.10: ; GFX7-NEXT: v_mov_b32_e32 v3, s4 @@ -8557,15 +8556,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: v_add_f32_e32 v2, v4, v1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX7-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v4 +; GFX7-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB29_11 ; GFX7-NEXT: ; %bb.12: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: .LBB29_13: ; %Flow19 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 @@ -8578,10 +8577,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; ; GFX6-LABEL: local_ds_fadd_one_as: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s5, s5, 4 @@ -8593,8 +8592,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: s_lshl_b32 s8, s5, 3 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ds_read_b32 v1, v2 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x42280000, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_2: ; %atomicrmw.start @@ -8604,8 +8603,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v1, v4, v3 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v2, v4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v4 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, v4 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_2 ; GFX6-NEXT: ; %bb.3: ; %Flow20 @@ -8616,15 +8615,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_readfirstlane_b32 s10, v1 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v1, s8, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB29_7 ; GFX6-NEXT: ; %bb.5: -; GFX6-NEXT: s_lshl_b32 s0, s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshl_b32 s2, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: ds_read_b32 v3, v1 -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[8:9] -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[8:9] +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x42280000, v2 ; GFX6-NEXT: s_mov_b64 
s[8:9], 0 ; GFX6-NEXT: .LBB29_6: ; %atomicrmw.start2 @@ -8633,8 +8632,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v4, v3, v2 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB29_6 @@ -8644,16 +8643,16 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 ; GFX6-NEXT: v_add_f32_e32 v0, s10, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: s_mov_b64 s[0:1], exec +; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: ; implicit-def: $vgpr0 ; GFX6-NEXT: .LBB29_8: ; %ComputeLoop ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1] +; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0 ; GFX6-NEXT: v_readfirstlane_b32 s8, v1 ; GFX6-NEXT: v_readlane_b32 s9, v2, s5 ; GFX6-NEXT: s_mov_b32 m0, s5 @@ -8666,8 +8665,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX6-NEXT: ; implicit-def: $vgpr2 -; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[0:1] +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX6-NEXT: s_xor_b64 s[6:7], exec, s[2:3] ; GFX6-NEXT: s_cbranch_execz .LBB29_13 ; GFX6-NEXT: ; %bb.10: ; GFX6-NEXT: v_mov_b32_e32 
v3, s4 @@ -8681,15 +8680,15 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX6-NEXT: v_add_f32_e32 v2, v4, v1 ; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v4, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v4 -; GFX6-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], v2, v4 +; GFX6-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB29_11 ; GFX6-NEXT: ; %bb.12: ; %Flow ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: .LBB29_13: ; %Flow17 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: v_add_f32_e32 v0, s4, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index d068e2ae4ec97f..8386a685a1a120 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 16, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_barrier ; GCN-NEXT: ds_read_b32 v0, v0 @@ -51,7 +51,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 { ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, 12, v1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_barrier ; SI-NEXT: v_sub_i32_e32 v2, vcc, 28, v1 @@ -73,7 +73,7 @@ define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) 
%out) #0 { ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write2_b32 v1, v0, v2 offset1:4 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_barrier ; CI-NEXT: ds_read2_b32 v[3:4], v0 offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 8bd3b6ecfa6ad3..e2dee2e3aeb8d2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -19,20 +19,20 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 -; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s6, 0 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v3, s4, v1 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 ; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1 @@ -47,7 +47,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 
s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc @@ -58,30 +58,30 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_mov_b32 s0, 0 +; FLATSCR-NEXT: s_mov_b32 s2, 0 ; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x3000 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc @@ -203,19 +203,19 @@ entry: define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) { ; MUBUF-LABEL: local_stack_offset_uses_sp_flat: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 -; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s6, 0 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v2, s4, v0 -; MUBUF-NEXT: s_add_i32 s4, s4, 1 -; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: v_add_u32_e32 v2, s6, v0 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 ; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 @@ -274,33 +274,33 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; ; FLATSCR-LABEL: local_stack_offset_uses_sp_flat: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 
0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 +; FLATSCR-NEXT: s_mov_b32 s2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s2 offset:1024 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_add_i32 s3, s2, 0x2000 +; FLATSCR-NEXT: s_add_i32 s2, s2, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x1000 +; FLATSCR-NEXT: s_addk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc +; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll index 7814eb603e5541..cc90d03e667157 100644 --- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll +++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll @@ -14,9 +14,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -26,10 +26,10 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: .LBB0_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -55,9 +55,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; %bb0 ; GCN-NEXT: s_getpc_b64 s[8:9] @@ -73,10 +73,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: v_nop_e64 
; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -102,9 +102,9 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_cbranch_vccz .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 @@ -122,10 +122,10 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %bb3 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -150,7 +150,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -254,28 +254,28 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: 
uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc5: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_4-.Lpost_getpc5)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_4-.Lpost_getpc5)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN-NEXT: .LBB5_1: ; %Flow -; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 ; GCN-NEXT: .LBB5_2: ; %bb2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 17 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB5_3: ; %bb4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -294,17 +294,17 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_mov_b64 vcc, exec ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc6: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc6)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc6)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295 +; 
GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: .Lpost_getpc4: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB5_3-.Lpost_getpc4)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB5_3-.Lpost_getpc4)>>32 -; GCN-NEXT: s_setpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295 +; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32 +; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 390d1d70ff2aae..f19eeee1ca7411 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -4,12 +4,12 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_flat: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GCN-NEXT: .LBB0_2: ; %for.body @@ -50,12 +50,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_global: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 
0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 @@ -96,12 +96,12 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_constant: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: .LBB2_2: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -143,7 +143,7 @@ for.end: ; preds = %for.body, %entry define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { ; GCN-LABEL: copy_local: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 5484ba1ed2fe08..df3b2135e72ac1 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; ; GCN-LABEL: break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: undef_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) 
@@ -207,7 +207,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: constexpr_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -297,7 +297,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -386,7 +386,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: false_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -479,7 +479,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: invert_true_phi_cond_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s3, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 7998d430d5f907..cb3ea2e812770c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -24,7 +24,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: ds_write_b8 v0, v1 ; GCN-NEXT: ds_read_u8 v2, v0 offset:2 ; GCN-NEXT: ds_read_u16 v3, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v0, v2 offset:6 
; GCN-NEXT: ds_write_b16 v0, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 00dcff093c7db2..c6a734a065ff15 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -26,18 +26,18 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:256 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 ; GCN-NEXT: global_store_dword v1, v0, s[0:1] @@ -74,21 +74,21 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_lshl_b32 s2, s2, 2 ; GCN-NEXT: v_mov_b32_e32 v2, 3 ; GCN-NEXT: ds_write_b32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 ; GCN-NEXT: ds_read_b32 v0, v0 offset:512 -; GCN-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: v_add_u32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 9bbcc6988e311f..00d01a080ad14a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -161,29 +161,24 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f0@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f0@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, 
s5, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm call void @f0() call void @f1() @@ -200,36 +195,28 @@ define amdgpu_kernel void @k23() { ; GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm - - call void @f2() call void @f3() ret void @@ -250,35 +237,30 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; 
GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f1@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f1@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 -; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:16 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, f2@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index e97d0e515c36d6..50595ef7ff137d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -226,37 +226,29 @@ define amdgpu_kernel void @k01() { ; GCN-LABEL: k01: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm - - call void @f0() call void @f1() ret void @@ -273,36 +265,28 @@ define amdgpu_kernel void @k23() { ; 
GCN-LABEL: k23: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm - - call void @f2() call void @f3() ret void @@ -323,41 +307,33 @@ define amdgpu_kernel void @k123() { ; GCN-LABEL: k123: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s11 -; GCN-NEXT: s_add_i32 s10, s10, s15 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s7 +; GCN-NEXT: s_add_i32 s6, s6, s9 +; GCN-NEXT: s_lshr_b32 
flat_scratch_hi, s6, 8 +; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[10:11], s[8:9] -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] -; GCN-NEXT: s_mov_b64 s[16:17], s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s15, 1 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v1, v0 offset:2 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: ds_write_b8 v0, v1 offset:2 -; GCN-NEXT: s_mov_b64 s[4:5], s[16:17] -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_endpgm - +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm call void @f1() %ld = load i8, ptr addrspace(3) @v3 %mul = mul i8 %ld, 8 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index fef1b57db5685d..1429251fc64211 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -1,30 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 
-mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_ids_kernel() { -; GFX9-SDAG-LABEL: workgroup_ids_kernel: -; GFX9-SDAG: ; %bb.0: ; 
%.entry -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_ids_kernel: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, s6 -; GFX9-GISEL-NEXT: s_mov_b32 s1, s7 -; GFX9-GISEL-NEXT: s_mov_b32 s2, s8 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm ; ; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: ; GFX9ARCH-SDAG: ; %bb.0: ; %.entry @@ -83,27 +72,20 @@ define amdgpu_kernel void @workgroup_ids_kernel() { define amdgpu_kernel void @caller() { ; GFX9-SDAG-LABEL: caller: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s9 -; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s13 +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-SDAG-NEXT: s_getpc_b64 
s[8:9] +; GFX9-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -111,27 +93,20 @@ define amdgpu_kernel void @caller() { ; ; GFX9-GISEL-LABEL: caller: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s9 -; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s13 +; GFX9-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s13 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; 
GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] @@ -139,81 +114,61 @@ define amdgpu_kernel void @caller() { ; ; GFX9ARCH-SDAG-LABEL: caller: ; GFX9ARCH-SDAG: ; %bb.0: -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] -; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 -; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 -; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; 
GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-SDAG-NEXT: s_endpgm ; ; GFX9ARCH-GISEL-LABEL: caller: ; GFX9ARCH-GISEL: ; %bb.0: -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 -; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] -; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 -; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 -; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 flat_scratch_lo, s10, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, s12 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[8:9] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX9ARCH-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 ; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: caller: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16 +; GFX12-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-NEXT: s_mov_b32 s32, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call void @callee(i32 %idx) #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll 
b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index 2963e7b765a0d1..7830bfc6ac7f59 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -24,8 +24,8 @@ define amdgpu_kernel void @zext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -45,8 +45,8 @@ define amdgpu_kernel void @sext_shl64_to_32(ptr addrspace(1) nocapture %out, i32 define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -67,8 +67,8 @@ define amdgpu_kernel void @zext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; 
GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 @@ -89,7 +89,7 @@ define amdgpu_kernel void @sext_shl64_overflow(ptr addrspace(1) nocapture %out, define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: mulu24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_and_b32_e32 v0, 6, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -112,7 +112,7 @@ bb: define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index e8ac1b2887c36e..994ef22539a65f 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -18,7 +18,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; CI-LABEL: 
s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -63,7 +63,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 @@ -79,7 +79,7 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -90,7 +90,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 
0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -131,7 +131,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -142,9 +142,7 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -169,20 +167,20 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ 
-203,22 +201,22 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -226,10 +224,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -239,12 +236,9 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -266,20 +260,20 @@ define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: lshr_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -300,22 +294,22 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_and_b32 s1, s8, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s1, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -323,10 +317,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: lshr_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -336,12 +329,9 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: lshr_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -363,7 +353,7 @@ define 
amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -374,7 +364,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -394,7 +384,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -414,7 +404,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -425,9 +415,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -450,7 +438,7 @@ define 
amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -461,7 +449,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -480,7 +468,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -497,7 +485,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -508,9 +496,7 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -533,7 +519,7 @@ define 
amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -545,7 +531,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -567,7 +553,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -596,7 +582,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -608,9 +594,7 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_lshr_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -636,7 +620,7 @@ define amdgpu_kernel 
void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: lshr_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -648,7 +632,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: lshr_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -670,7 +654,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -689,7 +673,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX10-LABEL: lshr_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -701,9 +685,7 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace ; ; GFX11-LABEL: lshr_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git 
a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index 3032b1028dc2d2..995c8c8679397c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @mad_u16( ; GFX8-LABEL: mad_u16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX9-LABEL: mad_u16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -51,7 +51,7 @@ define amdgpu_kernel void @mad_u16( ; ; GFX10-LABEL: mad_u16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -67,10 +67,8 @@ define amdgpu_kernel void @mad_u16( ; ; GFX11-LABEL: mad_u16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index e876a8d9dda692..620566d3baff38 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ 
b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s8, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 8eb0a46cc8b17f..400298bcff4f97 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -908,8 +908,8 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; CI-LABEL: mad_i64_i32_uniform: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -924,33 +924,33 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; SI-LABEL: mad_i64_i32_uniform: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mul_hi_u32 v1, s6, v0 -; SI-NEXT: s_mul_i32 s2, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 
s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: mad_i64_i32_uniform: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s2, s6, s7 -; GFX9-NEXT: s_add_u32 s0, s3, s0 -; GFX9-NEXT: s_addc_u32 s1, s2, s1 +; GFX9-NEXT: s_mul_i32 s0, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -958,8 +958,8 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-LABEL: mad_i64_i32_uniform: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s2, s6, s7 ; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7 @@ -975,8 +975,8 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-LABEL: mad_i64_i32_uniform: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: 
s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index b8b4d4440d5809..9ec37a5e14cdf9 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -15,18 +15,18 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -56,12 +56,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -70,13 +70,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-MAD-LABEL: madak_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -85,10 +85,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-MAD-LABEL: madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -105,13 +103,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX940-FMA-LABEL: madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -120,13 +117,13 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-FMA-LABEL: madak_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -135,10 +132,8 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-FMA-LABEL: madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; 
GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -170,7 +165,7 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { ; GFX6-LABEL: madak_2_use_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -195,7 +190,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -225,7 +220,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -245,7 +240,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword 
v1, v0, s[2:3] glc dlc @@ -264,9 +259,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -276,9 +269,9 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-MAD-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_add_f32 v2, 0x41200000, v2 -; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 ; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc @@ -289,8 +282,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -310,7 +302,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -329,9 +321,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -375,7 +365,7 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 { ; GFX6-LABEL: madak_m_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +381,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -408,7 +398,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -419,7 +409,7 @@ define amdgpu_kernel void 
@madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] @@ -430,14 +420,13 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-MAD-NEXT: s_nop 0 @@ -446,8 +435,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -458,7 +446,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] @@ -469,9 +457,7 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] @@ -498,18 +484,18 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: madak_inline_imm_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -517,8 +503,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr 
addrspace(1) noalias %out, p ; ; GFX8-LABEL: madak_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -539,12 +525,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX9-LABEL: madak_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -553,13 +539,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-MAD-LABEL: madak_inline_imm_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -568,10 +554,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias 
%out, p ; GFX11-MAD-LABEL: madak_inline_imm_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -588,13 +572,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; ; GFX940-FMA-LABEL: madak_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -603,13 +586,13 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX10-FMA-LABEL: madak_inline_imm_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; 
GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -618,10 +601,8 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p ; GFX11-FMA-LABEL: madak_inline_imm_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -651,26 +632,26 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 { ; GFX6-LABEL: s_v_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: 
s_endpgm ; ; GFX8-LABEL: s_v_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -688,23 +669,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX9-LABEL: s_v_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mac_f32_e32 v2, s0, v1 +; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1 ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_v_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) @@ -715,15 +695,14 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-MAD-LABEL: s_v_madak_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 @@ -732,24 +711,22 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; ; GFX940-FMA-LABEL: s_v_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s0, v1 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1 ; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_v_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) @@ -760,10 +737,8 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr ; GFX11-FMA-LABEL: s_v_madak_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: 
s_load_b32 s0, s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] @@ -788,84 +763,82 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: v_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mac_f32_e32 v3, s0, v2 +; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2 ; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: v_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s2, s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mac_f32_e32 v2, s2, v3 +; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: v_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000 -; 
GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: v_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1 +; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 ; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] @@ -875,47 +848,44 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a ; ; GFX940-FMA-LABEL: v_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[0:1] -; GFX940-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1 -; GFX940-FMA-NEXT: global_store_dword v0, 
v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: v_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: v_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000 +; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -935,7 +905,7 @@ define amdgpu_kernel void 
@v_s_madak_f32(ptr addrspace(1) noalias %out, float %a define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX6-LABEL: s_s_madak_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 @@ -949,7 +919,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -961,7 +931,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -972,7 +942,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 @@ -982,7 +952,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -994,7 +964,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; 
GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,7 +975,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 @@ -1015,7 +985,7 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1033,19 +1003,19 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src0_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword 
v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1053,8 +1023,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src0_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1076,13 +1046,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src0_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1091,13 +1061,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1106,10 +1076,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src0_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1126,14 +1094,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, 
v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1142,13 +1109,13 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1157,10 +1124,8 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src0_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1191,19 +1156,19 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 { ; GFX6-LABEL: no_madak_src1_modifier_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 ; GFX6-NEXT: s_mov_b32 s0, 0x41200000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -1211,8 +1176,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX8-LABEL: no_madak_src1_modifier_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1234,13 +1199,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX9-LABEL: no_madak_src1_modifier_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, 0x41200000 +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1249,13 +1214,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX10-MAD: ; %bb.0: ; GFX10-MAD-NEXT: s_clause 0x1 -; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-MAD-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: s_clause 0x1 ; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000 ; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] @@ -1264,10 +1229,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-MAD-LABEL: no_madak_src1_modifier_f32: ; GFX11-MAD: ; %bb.0: ; GFX11-MAD-NEXT: s_clause 0x1 -; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-MAD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: s_clause 0x1 @@ -1284,14 +1247,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; ; GFX940-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX940-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX940-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[0:1] -; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0 ; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 @@ -1300,13 +1262,13 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX10-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX10-FMA: ; %bb.0: ; GFX10-FMA-NEXT: s_clause 0x1 -; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-FMA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: s_clause 0x1 ; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000 ; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] @@ -1315,10 +1277,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % ; GFX11-FMA-LABEL: no_madak_src1_modifier_f32: ; GFX11-FMA: ; %bb.0: ; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_clause 0x1 @@ -1352,36 +1312,36 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias % define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { ; GFX6-LABEL: madak_constant_bus_violation: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %bb3 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: .LBB9_2: ; %bb4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x12 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5 ; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: madak_constant_bus_violation: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 +; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; 
GFX8-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %bb3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -1390,7 +1350,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX8-NEXT: .LBB9_2: ; %bb4 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1401,9 +1361,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX9-LABEL: madak_constant_bus_violation: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %bb3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -1412,7 +1372,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX9-NEXT: .LBB9_2: ; %bb4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5 @@ -1423,9 +1383,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-MAD-LABEL: madak_constant_bus_violation: ; GFX10-MAD: ; %bb.0: ; %bb -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1434,7 +1394,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; 
GFX10-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000 @@ -1445,9 +1405,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-MAD-LABEL: madak_constant_bus_violation: ; GFX11-MAD: ; %bb.0: ; %bb -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-MAD-NEXT: ; %bb.1: ; %bb3 ; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1456,7 +1416,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-MAD-NEXT: .LBB9_2: ; %bb4 ; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1470,9 +1430,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX940-FMA-LABEL: madak_constant_bus_violation: ; GFX940-FMA: ; %bb.0: ; %bb -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX940-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1481,7 +1441,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; 
GFX940-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5 @@ -1492,9 +1452,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX10-FMA-LABEL: madak_constant_bus_violation: ; GFX10-FMA: ; %bb.0: ; %bb -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX10-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1503,7 +1463,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX10-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: s_load_dword s0, s[2:3], 0x48 +; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000 @@ -1514,9 +1474,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; ; GFX11-FMA-LABEL: madak_constant_bus_violation: ; GFX11-FMA: ; %bb.0: ; %bb -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2 ; GFX11-FMA-NEXT: ; %bb.1: ; %bb3 ; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 @@ -1525,7 +1485,7 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl ; GFX11-FMA-NEXT: .LBB9_2: ; %bb4 ; GFX11-FMA-NEXT: 
global_load_b32 v0, v[0:1], off glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_load_b32 s0, s[2:3], 0x48 +; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48 ; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 92536c2078514a..c7a831185b83c6 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-LABEL: test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX9-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x1c +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x38 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s10 -; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 +; GFX9-NEXT: s_and_b32 s4, s7, 0xffff +; GFX9-NEXT: s_mul_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: v_add_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -34,13 +34,13 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[6:7], 0x1c -; GFX10-NEXT: s_load_dword s5, s[6:7], 0x38 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s7, s[4:5], 0x1c +; GFX10-NEXT: s_load_dword s8, s[4:5], 0x38 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s10, v0 +; GFX10-NEXT: s_and_b32 s4, s7, 0xffff +; GFX10-NEXT: s_mul_i32 s6, s6, s4 +; GFX10-NEXT: v_add3_u32 v0, s8, s6, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 @@ -59,16 +59,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-LABEL: test: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x1c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x38 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x1c +; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x38 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s13, s13, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s5, s13, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s15, s15, s4 +; GFX11-NEXT: v_add3_u32 v0, s5, s15, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index e929da796de6de..2b5d32fa7b9776 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_store_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_mov_b32 s1, s0 @@ -91,7 +91,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_store_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_store_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 @@ -397,7 +397,7 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; GFX10-LABEL: long_load_chain: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -670,7 +670,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX11-LABEL: long_load_chain: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 @@ -944,7 +944,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) { ; ; GFX12-LABEL: long_load_chain: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f ; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0 diff --git 
a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index a8139cc6bc4c95..8ef2ca2765e8a1 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -28,12 +28,12 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -54,8 +54,8 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -78,12 +78,12 @@ define amdgpu_kernel 
void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -104,8 +104,8 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -139,19 +139,19 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
global_load_short_d16 v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -175,8 +175,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sge_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -202,12 +202,12 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imax_sge_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 @@ -229,8 +229,8 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_imax_sgt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imax_sgt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -277,8 +277,8 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_uge_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -299,12 +299,12 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_uge_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: 
global_store_short v0, v1, s[4:5] @@ -325,8 +325,8 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -347,12 +347,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umax_ugt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -372,8 +372,8 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; VI-LABEL: v_test_umax_ugt_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -396,12 +396,12 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: 
v_test_umax_ugt_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 4fb90bbc46a8f5..bef9ff82aa396c 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; 
EG-LABEL: v_test_imax_sge_i32: @@ -58,26 +58,26 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v3, s7, v3 -; SI-NEXT: v_max_i32_e32 v2, s6, v2 -; SI-NEXT: v_max_i32_e32 v1, s5, v1 -; SI-NEXT: v_max_i32_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v3, s11, v3 +; SI-NEXT: v_max_i32_e32 v2, s10, v2 +; SI-NEXT: v_max_i32_e32 v1, s9, v1 +; SI-NEXT: v_max_i32_e32 v0, s8, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_v4i32: @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 
0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -146,8 +146,8 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sge_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -175,24 +175,24 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_sbyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sge_i8: @@ -240,8 
+240,8 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -269,7 +269,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_imax_sgt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -303,23 +303,23 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: 
s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_i32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_imax_sgt_i32: @@ -355,7 +355,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_imax_sgt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -385,23 +385,23 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_u32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i32: @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_uge_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -467,20 +467,20 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind { ; SI-LABEL: s_test_umax_uge_v3i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_max_u32 s2, s6, s10 -; SI-NEXT: s_max_u32 s0, s5, s9 -; SI-NEXT: s_max_u32 s1, s4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8 +; SI-NEXT: s_max_u32 s6, s6, s10 +; SI-NEXT: s_max_u32 s5, s5, s9 +; SI-NEXT: s_max_u32 s4, s4, s8 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: s_test_umax_uge_v3i32: @@ -507,24 +507,24 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32 define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) 
%out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_uge_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_uge_i8: @@ -565,20 +565,20 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dword s0, s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 
s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dword s4, s[4:5], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: v_max_u32_e32 v0, s0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_max_u32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -614,7 +614,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_test_umax_ugt_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -644,7 +644,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: s_test_umax_ugt_imm_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -680,9 +680,9 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_umax_ugt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ 
-727,9 +727,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind { ; SI-LABEL: simplify_demanded_bits_test_max_slt_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,9 +773,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind { ; SI-LABEL: s_test_imax_sge_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -826,8 +826,8 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_umax_ugt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -868,8 +868,8 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 
%b) nounwind { ; SI-LABEL: test_umax_uge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -910,8 +910,8 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sgt_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -952,8 +952,8 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: test_imax_sge_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..ae1f31272a15f0 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ 
-121,7 +121,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { ; CHECK-LABEL: memcpy_p1_p1_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -145,7 +145,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p1_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] @@ -181,12 +181,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s8, s8, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -206,52 +206,52 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: 
v_mov_b32_e32 v1, s2 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 
offen offset:6 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -262,229 +262,229 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 
offen offset:23 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; 
CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 ; 
CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, 
v1, s[8:11], 0 offen offset:39 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 +; CHECK-NEXT: 
buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, 
s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, 
s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: 
global_load_ubyte v7, v0, s[0:1] offset:94 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 
0 offen offset:88 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -492,13 +492,13 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte 
v2, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -506,54 +506,54 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: 
global_load_ubyte v12, v0, s[0:1] offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen 
offset:123 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -563,32 +563,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 -; 
CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 +; 
CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -601,287 +601,287 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, 
s[8:11], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 
0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 ; 
CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen 
offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 +; CHECK-NEXT: 
buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: 
flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 +; CHECK-NEXT: 
buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: 
flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 @@ -891,20 +891,20 @@ define 
amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte 
v[0:1], v5 offset:97 @@ -935,7 +935,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p3_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -971,7 +971,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 @@ -1254,7 +1254,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 @@ -1367,7 +1367,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { ; CHECK-LABEL: memcpy_p1_p1_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32 @@ -1391,7 +1391,7 @@ entry: define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p1_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, 
s[2:3] @@ -1427,12 +1427,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s13 +; CHECK-NEXT: s_add_u32 s8, s8, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 @@ -1452,52 +1452,52 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte 
v6, v1, s[16:19], 0 offen offset:11 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_byte 
v17, v1, s[8:11], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 @@ -1508,229 +1508,229 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] 
offset:33 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_byte v6, 
v1, s[8:11], 0 offen offset:24 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 ; CHECK-NEXT: 
s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: 
buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(25) 
-; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71 ; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, 
v1, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65 ; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64 ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 ; 
CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen 
offset:75 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: buffer_store_byte v18, 
v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82 ; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 @@ -1738,13 +1738,13 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 ; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100 ; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 ; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 ; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 @@ -1752,54 +1752,54 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 ; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, 
s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107 ; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101 ; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 ; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 ; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 ; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104 ; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 ; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 ; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99 +; 
CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116 ; CHECK-NEXT: 
s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1809,32 +1809,32 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s13 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; 
CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30 ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 @@ -1847,287 +1847,287 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47 ; CHECK-NEXT: 
s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 +; CHECK-NEXT: buffer_load_ubyte v12, v2, 
s[8:11], 0 offen offset:34 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 
offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 
0 offen offset:58 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64 ; CHECK-NEXT: s_nop 
0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, 
v2, s[16:19], 0 offen offset:67 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 +; CHECK-NEXT: buffer_load_ubyte v17, v2, 
s[8:11], 0 offen offset:89 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 
0 offen offset:101 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_ubyte 
v16, v2, s[8:11], 0 offen offset:104 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 ; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen 
offset:98 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 ; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 ; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 ; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 ; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 @@ -2137,20 +2137,20 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 ; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 ; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: 
buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 ; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 @@ -2181,7 +2181,7 @@ entry: define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p3_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] @@ -2217,7 +2217,7 @@ entry: define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 ; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll 
b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll index 3a6d8ca1e35f60..f60728c16a3ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll @@ -9,7 +9,7 @@ define void @memcpy_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -26,7 +26,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz31_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 @@ -34,7 +34,7 @@ define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-NEXT: v_mov_b32_e32 v4, s10 ; CHECK-NEXT: v_mov_b32_e32 v5, s11 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[6:7] offset:15 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[4:5] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -47,7 +47,7 @@ define void @memcpy_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 diff --git 
a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll index b32bfd0e495ba1..1b8483a54bb3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll @@ -9,7 +9,7 @@ define void @memmove_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz16_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -27,8 +27,8 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: global_load_ubyte v9, v2, s[6:7] offset:30 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: global_load_ubyte v9, v2, s[4:5] offset:30 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s5 @@ -53,7 +53,7 @@ define void @memmove_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr add ; CHECK-LABEL: memmove_p1_p4_sz32_align_4_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s8 ; CHECK-NEXT: v_mov_b32_e32 v3, s9 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 3a065d518f0a9d..c49e0501665c57 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr 
addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -24,7 +24,7 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 @@ -69,7 +69,7 @@ bb: define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -98,7 +98,7 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 @@ -250,11 +250,11 @@ bb: define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) { ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[0:1] +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -267,20 +267,20 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap ; ; GCN-SCRATCH-LABEL: vector_clause_indirect: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[0:1] +; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x1 ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off ; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -384,10 +384,10 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; GCN-NEXT: 
s_mov_b32 s19, 0xe00000 -; GCN-NEXT: s_add_u32 s16, s16, s9 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44 +; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 @@ -411,13 +411,13 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; ; GCN-SCRATCH-LABEL: flat_scratch_load: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x44 +; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1 ; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8 @@ -453,22 +453,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32> %desc) { ; GCN-LABEL: flat_scratch_load_clause: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: s_mov_b32 s15, 0xe00000 -; GCN-NEXT: s_add_u32 s12, s12, s9 -; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xe00000 +; GCN-NEXT: s_add_u32 s4, 
s4, s3 +; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000 -; GCN-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: exp mrt0 v0, off, off, off done vm @@ -476,10 +476,10 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32 ; ; GCN-SCRATCH-LABEL: flat_scratch_load_clause: ; GCN-SCRATCH: ; %bb.0: ; %.entry -; GCN-SCRATCH-NEXT: s_add_u32 s6, s6, s11 -; GCN-SCRATCH-NEXT: s_addc_u32 s7, s7, 0 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5 +; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off diff --git a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll index 9c2b437a08f088..7bb09f6697b685 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll @@ -18,7 +18,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: ; GCN: 
v_mfma_f32_32x32x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %a = bitcast i32 1 to <2 x i16> @@ -30,7 +30,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: ; GCN: v_mfma_f32_16x16x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: ; GCN: v_mfma_f32_4x4x2bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: ; GCN: v_mfma_f32_32x32x4bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: ; GCN: v_mfma_f32_16x16x8bf16 
v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k: ; GCN: v_mfma_f32_32x32x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k: ; GCN: v_mfma_f32_16x16x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k: ; GCN: v_mfma_f32_4x4x4bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> 
@llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k: ; GCN: v_mfma_f32_32x32x8bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k: ; GCN: v_mfma_f32_16x16x16bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64: ; GCN: v_mfma_f64_4x4x4f64 v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+:[0-9]+}} -define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) { bb: %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double 1.0, double 1.0, double 128.0, i32 0, i32 0, i32 0) store double %mai.1, ptr addrspace(1) %arg @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64: ; GCN: v_mfma_f64_16x16x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) #0 { +define 
amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x double>, ptr addrspace(1) %arg %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double 1.0, double 1.0, <4 x double> %in.1, i32 0, i32 0, i32 0) @@ -139,7 +139,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: ; GCN: v_mfma_i32_32x32x8i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -149,12 +149,10 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: ; GCN: v_mfma_i32_16x16x16i8 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index e0708a55f438bd..ba34c1bbe1d710 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -19,7 +19,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #1 { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> 
@llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) #0 { bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) #0 { bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> undef) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -63,7 +63,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #0 { bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -78,7 +78,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb: ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #0 { bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 
1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -106,6 +106,5 @@ bb: declare void @foo() -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" "amdgpu-no-agpr" } -attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } -attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } +attributes #1 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll index b48152dad99ac3..59b13c02f92fb9 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll @@ -30,7 +30,7 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i3 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0) @@ -40,7 +40,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8: ; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -50,7 +50,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32: ; 
GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -60,7 +60,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32: ; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -70,7 +70,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8: ; GCN: v_mfma_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -80,7 +80,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8: ; GCN: v_mfma_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x 
float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -90,7 +90,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8: ; GCN: v_mfma_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -100,7 +100,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8: ; GCN: v_mfma_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -110,7 +110,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8: ; GCN: v_mfma_f32_32x32x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -120,7 +120,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8: ; GCN: v_mfma_f32_32x32x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -130,7 +130,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8: ; GCN: v_mfma_f32_32x32x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -140,7 +140,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8: ; GCN: v_mfma_f32_32x32x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -150,7 +150,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16: ; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = 
tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -160,7 +160,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16: ; GCN: v_smfmac_f32_32x32x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -170,7 +170,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16: ; GCN: v_smfmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -180,7 +180,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16: ; GCN: v_smfmac_f32_32x32x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -190,7 +190,7 @@ bb: ; 
GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8: ; GCN: v_smfmac_i32_16x16x64_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -200,7 +200,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8: ; GCN: v_smfmac_i32_32x32x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 0, i32 0) @@ -210,7 +210,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8: ; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -220,7 +220,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8: ; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void 
@test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -230,7 +230,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8: ; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -240,7 +240,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8: ; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -250,7 +250,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8: ; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel 
void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -260,7 +260,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8: ; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -270,7 +270,7 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8: ; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) @@ -280,12 +280,10 @@ bb: ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8: ; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}} -define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { +define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) { bb: %in.1 = 
load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll index bffd15872c42cb..06775f5d3f92b2 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll @@ -19,7 +19,7 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: ; GCN: v_mfma_f32_32x32x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: ; GCN: v_mfma_f32_16x16x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -39,7 +39,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: ; GCN: v_mfma_f32_4x4x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr 
addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -49,7 +49,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: ; GCN: v_mfma_f32_32x32x2{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -59,7 +59,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -69,7 +69,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: ; GCN: v_mfma_f32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> undef, <4 x half> undef, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void 
@test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -89,7 +89,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: ; GCN: v_mfma_f32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -99,7 +99,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: ; GCN: v_mfma_f32_32x32x8{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0) @@ -109,7 +109,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: ; GCN: v_mfma_f32_16x16x16{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -119,7 +119,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: ; GCN: v_mfma_i32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, 
v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <32 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 1, <32 x i32> %in.1, i32 0, i32 0, i32 0) @@ -129,7 +129,7 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: ; GCN: v_mfma_i32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <16 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0) @@ -139,12 +139,10 @@ bb: ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: ; GCN: v_mfma_i32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 { +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) { bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0) store <4 x i32> %mai.1, ptr addrspace(1) %arg ret void } - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index a77892c8f5fc7b..9dafa27ece86f6 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -31,8 +31,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 
; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,8 +53,8 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -75,12 +75,12 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -89,13 +89,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_sle_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -104,10 +104,8 @@ define amdgpu_kernel void 
@v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_sle_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -145,7 +143,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -156,7 +154,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -167,7 +165,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_sle_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_sle_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -187,7 +185,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) 
%out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -217,7 +215,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -228,7 +226,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -239,7 +237,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -249,7 +247,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_imin_sle_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -259,7 +257,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -292,8 +290,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; CI-LABEL: s_test_imin_sle_v4i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s11, s15 ; CI-NEXT: s_min_i32 s3, s10, s14 @@ -310,8 +308,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; VI-LABEL: s_test_imin_sle_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s11, s15 ; VI-NEXT: s_min_i32 s3, s10, s14 @@ -328,8 +326,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX9-LABEL: s_test_imin_sle_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s11, s15 @@ -346,8 +344,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX10-LABEL: s_test_imin_sle_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s11, s15 @@ -364,8 +362,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; GFX11-LABEL: 
s_test_imin_sle_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s7, s11 @@ -419,9 +417,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; CI-LABEL: s_test_imin_sle_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 @@ -434,9 +432,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 @@ -449,9 +447,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 @@ -464,9 +462,9 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x 
i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 @@ -479,13 +477,13 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i8 s2, s4 -; GFX11-NEXT: s_sext_i32_i8 s3, s5 +; GFX11-NEXT: s_sext_i32_i8 s2, s2 +; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -556,9 +554,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; CI-LABEL: s_test_imin_sle_v4i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -589,9 +587,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; 
VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3 @@ -618,9 +616,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 @@ -646,9 +644,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-LABEL: s_test_imin_sle_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 @@ -675,27 +673,29 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX11-LABEL: s_test_imin_sle_v4i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x4c +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: 
v_ashrrev_i16 v0, 8, s0 -; GFX11-NEXT: v_ashrrev_i16 v1, 8, s1 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2 +; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5 -; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000 +; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX11-NEXT: v_min_i16 v4, s0, s1 +; GFX11-NEXT: v_min_i16 v4, s2, s3 ; GFX11-NEXT: v_min_i16 v5, s4, s5 ; GFX11-NEXT: v_min_i16 v2, v2, v3 ; GFX11-NEXT: v_min_i16 v0, v0, v1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -707,7 +707,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -752,7 +751,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -771,7 +770,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; 
VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -790,7 +789,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -800,7 +799,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX10-LABEL: s_test_imin_sle_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3 @@ -809,7 +808,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 @@ -904,8 +903,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -934,8 +933,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -964,34 +963,34 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7 @@ -1031,8 +1030,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i32: ; CI: ; %bb.0: -; 
CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1053,8 +1052,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1075,12 +1074,12 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1089,13 +1088,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, 
v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1104,10 +1103,8 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1172,8 +1169,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_imin_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1194,8 +1191,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_imin_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1216,12 +1213,12 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_imin_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] @@ -1230,13 +1227,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_imin_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] +; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -1245,10 +1242,8 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_imin_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1287,7 +1282,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ 
-1298,7 +1293,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1309,7 +1304,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_imin_slt_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, s3 @@ -1319,7 +1314,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_imin_slt_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, s3 @@ -1329,7 +1324,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s2, s3 @@ -1360,8 +1355,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1374,8 +1369,8 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; VI-LABEL: s_test_imin_slt_v2i32: ; 
VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1388,36 +1383,36 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s1, s1, s3 ; GFX9-NEXT: s_min_i32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s0, s0, s2 ; GFX10-NEXT: s_min_i32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_i32 s2, s4, s6 @@ -1448,8 
+1443,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_slt_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1460,8 +1455,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_slt_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1472,8 +1467,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_slt_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1484,8 +1479,8 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_slt_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1496,11 +1491,11 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_slt_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 
0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1527,8 +1522,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; CI-LABEL: s_test_imin_sle_imm_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1539,8 +1534,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; VI-LABEL: s_test_imin_sle_imm_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1551,8 +1546,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX9-LABEL: s_test_imin_sle_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_i32 s2, s2, 8 @@ -1563,8 +1558,8 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX10-LABEL: s_test_imin_sle_imm_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; 
GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_i32 s2, s2, 8 @@ -1575,11 +1570,11 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; GFX11-LABEL: s_test_imin_sle_imm_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s4, 8 +; GFX11-NEXT: s_min_i32 s2, s2, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1617,8 +1612,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1639,8 +1634,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1661,12 +1656,12 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -1675,13 +1670,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ule_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -1690,10 +1685,8 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ule_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1748,8 +1741,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1772,8 +1765,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1796,12 +1789,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1812,13 +1805,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7] ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 @@ -1829,10 +1822,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -1911,8 +1902,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_umin_ule_v3i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1947,8 +1938,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_umin_ule_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1975,12 +1966,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_umin_ule_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 @@ -1991,13 +1982,13 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_umin_ule_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 @@ -2008,10 +1999,8 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_umin_ule_v3i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2053,7 +2042,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2064,7 +2053,7 @@ define amdgpu_kernel void 
@s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2075,7 +2064,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ule_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2085,7 +2074,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ule_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2095,7 +2084,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2136,8 +2125,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; CI-LABEL: v_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2158,8 +2147,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: v_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 
s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2180,12 +2169,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -2194,13 +2183,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX10-LABEL: v_test_umin_ult_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -2209,10 +2198,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; GFX11-LABEL: v_test_umin_ult_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; 
GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2268,8 +2255,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; CI-LABEL: v_test_umin_ult_i8: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2289,8 +2276,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_test_umin_ult_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2310,11 +2297,11 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: v_test_umin_ult_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] @@ -2323,12 +2310,12 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX10-LABEL: v_test_umin_ult_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_min_u16 v1, v1, v2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] @@ -2337,9 +2324,8 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; GFX11-LABEL: v_test_umin_ult_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] @@ -2377,7 +2363,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2388,7 +2374,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2399,7 +2385,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX9-LABEL: s_test_umin_ult_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2409,7 +2395,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX10-LABEL: s_test_umin_ult_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2419,7 +2405,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2471,7 +2457,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2492,7 +2478,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2513,7 +2499,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i32_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2531,7 +2517,7 @@ define amdgpu_kernel void 
@v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i32_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 @@ -2549,7 +2535,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i32_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 @@ -2621,7 +2607,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2643,7 +2629,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2665,7 +2651,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX9-LABEL: v_test_umin_ult_i16_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -2680,7 +2666,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX10-LABEL: v_test_umin_ult_i16_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], 
s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -2696,7 +2682,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX11-LABEL: v_test_umin_ult_i16_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -2735,7 +2721,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2746,7 +2732,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2757,7 +2743,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v1i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s2, s2, s3 @@ -2767,7 +2753,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v1i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s2, s2, s3 @@ -2777,7 +2763,7 @@ define amdgpu_kernel void 
@s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s2, s3 @@ -2818,8 +2804,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x8 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 ; CI-NEXT: s_min_u32 s5, s10, s18 @@ -2849,8 +2835,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 ; VI-NEXT: s_min_u32 s5, s10, s18 @@ -2880,8 +2866,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_min_u32 s4, s9, s17 @@ -2908,8 +2894,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x20 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[8:23], 
s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_min_u32 s4, s9, s17 @@ -2935,8 +2921,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x20 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_min_u32 s2, s7, s15 @@ -3109,8 +3095,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s2, s8, 16 ; CI-NEXT: s_and_b32 s3, s8, 0xffff @@ -3155,8 +3141,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; VI-LABEL: s_test_umin_ult_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s11, 16 ; VI-NEXT: s_lshr_b32 s4, s10, 16 @@ -3201,8 +3187,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX9-LABEL: s_test_umin_ult_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 
s15 @@ -3219,8 +3205,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX10-LABEL: s_test_umin_ult_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15 @@ -3233,8 +3219,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; GFX11-LABEL: s_test_umin_ult_v8i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11 @@ -3277,9 +3263,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3292,9 +3278,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3307,9 +3293,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -3322,9 +3308,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff @@ -3337,13 +3323,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_and_b32 s3, s5, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_u32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3386,9 +3372,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0xa -; CI-NEXT: s_load_dword s3, s[6:7], 0x13 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0xa +; CI-NEXT: s_load_dword s3, s[4:5], 0x13 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 @@ -3401,9 +3387,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x28 -; VI-NEXT: s_load_dword s3, s[6:7], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x28 +; VI-NEXT: s_load_dword s3, s[4:5], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 @@ -3416,9 +3402,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s2, s2 @@ -3431,9 +3417,9 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x28 -; GFX10-NEXT: s_load_dword s3, s[6:7], 
0x4c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 @@ -3446,13 +3432,13 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x28 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_sext_i32_i16 s3, s5 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 +; GFX11-NEXT: s_sext_i32_i16 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 @@ -3503,8 +3489,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; CI-LABEL: s_test_imin_sle_i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3517,8 +3503,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; VI-LABEL: s_test_imin_sle_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 
@@ -3531,8 +3517,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX9-LABEL: s_test_imin_sle_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s3, s2 @@ -3545,8 +3531,8 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-LABEL: s_test_imin_sle_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s3, s2 @@ -3559,14 +3545,14 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-LABEL: s_test_imin_sle_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s2, s4 -; GFX11-NEXT: s_ashr_i32 s3, s4, 16 +; GFX11-NEXT: s_sext_i32_i16 s3, s2 +; GFX11-NEXT: s_ashr_i32 s2, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -3599,8 +3585,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ult_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3617,8 +3603,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ult_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,16 +3621,16 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_umin_ult_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3653,14 +3639,14 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ult_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; 
GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3669,8 +3655,8 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ult_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1] @@ -3709,8 +3695,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_umin_ule_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3727,8 +3713,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_umin_ule_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3745,16 +3731,16 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; 
; GFX9-LABEL: test_umin_ule_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3763,14 +3749,14 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_umin_ule_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3779,8 +3765,8 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_umin_ule_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 
s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1] @@ -3819,8 +3805,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_slt_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3837,8 +3823,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_slt_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,16 +3841,16 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_slt_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, 
s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3873,14 +3859,14 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_slt_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3889,8 +3875,8 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_slt_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1] @@ -3929,8 +3915,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; CI-LABEL: test_imin_sle_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3947,8 +3933,8 @@ define 
amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; VI-LABEL: test_imin_sle_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3965,16 +3951,16 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX9-LABEL: test_imin_sle_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3983,14 +3969,14 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX10-LABEL: test_imin_sle_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5] -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s3, 
s3, s5 +; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7] +; GFX10-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10-NEXT: s_cselect_b32 s2, s2, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -3999,8 +3985,8 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; GFX11-LABEL: test_imin_sle_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1] @@ -4062,8 +4048,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_sle_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4093,8 +4079,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_sle_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4117,12 +4103,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_sle_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4131,13 +4117,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_sle_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4146,10 +4132,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_sle_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -4214,8 +4198,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; CI-LABEL: v_test_imin_ule_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
+; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -4244,8 +4228,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_test_imin_ule_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -4268,12 +4252,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: v_test_imin_ule_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] @@ -4282,13 +4266,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: v_test_imin_ule_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2 ; 
GFX10-NEXT: global_store_dword v0, v1, s[0:1] @@ -4297,10 +4281,8 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; GFX11-LABEL: v_test_imin_ule_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index a5995bd79ec383..c5c7bca3037d01 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -77,128 +77,96 @@ define void @nonkernel() { define amdgpu_kernel void @withcall() { ; GFX9-LABEL: withcall: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; GFX9-NEXT: s_add_u32 s20, s20, s9 -; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_add_u32 s8, s2, 36 -; GFX9-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] +; GFX9-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-NEXT: s_add_u32 s12, s12, s3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s8, s0, 36 +; GFX9-NEXT: s_addc_u32 s9, s1, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 +; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: withcall: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s22, -1 -; GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; GFX10-NEXT: s_add_u32 s20, s20, s9 -; GFX10-NEXT: s_addc_u32 s21, s21, 0 -; GFX10-NEXT: s_mov_b32 s14, s8 -; GFX10-NEXT: s_add_u32 s8, s2, 36 -; GFX10-NEXT: s_addc_u32 s9, s3, 0 -; GFX10-NEXT: s_getpc_b64 s[2:3] -; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX10-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX10-NEXT: s_mov_b32 s12, s6 -; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; 
GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-NEXT: s_add_u32 s12, s12, s3 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_add_u32 s8, s0, 36 +; GFX10-NEXT: s_addc_u32 s9, s1, 0 +; GFX10-NEXT: s_getpc_b64 s[0:1] +; GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX10-NEXT: s_mov_b32 s32, 0 -; GFX10-NEXT: ds_write_b32 v3, v3 offset:8 +; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm ; ; G_GFX9-LABEL: withcall: ; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; G_GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; G_GFX9-NEXT: s_mov_b32 s22, -1 -; G_GFX9-NEXT: s_mov_b32 s23, 0xe00000 -; G_GFX9-NEXT: s_add_u32 s20, s20, s9 -; G_GFX9-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX9-NEXT: s_mov_b32 s14, s8 -; G_GFX9-NEXT: s_add_u32 s8, s2, 36 -; G_GFX9-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_mov_b32 s14, -1 +; G_GFX9-NEXT: s_mov_b32 s15, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s12, s12, s3 +; G_GFX9-NEXT: s_addc_u32 s13, s13, 0 +; G_GFX9-NEXT: s_add_u32 s8, s0, 36 +; G_GFX9-NEXT: s_addc_u32 s9, s1, 0 ; G_GFX9-NEXT: s_getpc_b64 s[0:1] ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] -; G_GFX9-NEXT: v_mov_b32_e32 v3, 0 -; G_GFX9-NEXT: 
v_mov_b32_e32 v4, 8 -; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] -; G_GFX9-NEXT: s_mov_b32 s12, s6 -; G_GFX9-NEXT: s_mov_b32 s13, s7 +; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; G_GFX9-NEXT: s_mov_b64 s[0:1], s[12:13] +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 8 +; G_GFX9-NEXT: s_mov_b64 s[2:3], s[14:15] ; G_GFX9-NEXT: s_mov_b32 s32, 0 -; G_GFX9-NEXT: ds_write_b32 v4, v3 +; G_GFX9-NEXT: ds_write_b32 v1, v0 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; G_GFX9-NEXT: s_endpgm ; ; G_GFX10-LABEL: withcall: ; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 -; G_GFX10-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; G_GFX10-NEXT: s_mov_b32 s22, -1 -; G_GFX10-NEXT: s_mov_b32 s23, 0x31c16000 -; G_GFX10-NEXT: s_add_u32 s20, s20, s9 -; G_GFX10-NEXT: s_addc_u32 s21, s21, 0 -; G_GFX10-NEXT: s_mov_b32 s14, s8 -; G_GFX10-NEXT: s_add_u32 s8, s2, 36 -; G_GFX10-NEXT: s_addc_u32 s9, s3, 0 -; G_GFX10-NEXT: s_mov_b64 s[10:11], s[4:5] -; G_GFX10-NEXT: s_mov_b64 s[4:5], s[0:1] +; G_GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s14, -1 +; G_GFX10-NEXT: s_mov_b32 s15, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s12, s12, s3 +; G_GFX10-NEXT: s_addc_u32 s13, s13, 0 +; G_GFX10-NEXT: s_add_u32 s8, s0, 36 +; G_GFX10-NEXT: s_addc_u32 s9, s1, 0 ; G_GFX10-NEXT: s_getpc_b64 s[0:1] ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 -; G_GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; G_GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; G_GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX10-NEXT: v_mov_b32_e32 v3, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v4, 8 -; G_GFX10-NEXT: s_mov_b64 s[0:1], s[20:21] -; G_GFX10-NEXT: s_mov_b64 s[2:3], s[22:23] -; G_GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 -; 
G_GFX10-NEXT: s_mov_b32 s12, s6 -; G_GFX10-NEXT: s_mov_b32 s13, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 8 +; G_GFX10-NEXT: s_mov_b64 s[0:1], s[12:13] +; G_GFX10-NEXT: s_mov_b64 s[2:3], s[14:15] ; G_GFX10-NEXT: s_mov_b32 s32, 0 -; G_GFX10-NEXT: ds_write_b32 v4, v3 +; G_GFX10-NEXT: ds_write_b32 v1, v0 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; G_GFX10-NEXT: s_endpgm store i32 0, ptr addrspace(3) @used_by_both call void @nonkernel() diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll index 1c38f8ffc89edc..99120ab4a14249 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -27,9 +27,9 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: add_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: 
$sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -50,9 +50,9 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_imm ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -73,9 +73,9 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_imm_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY 
$sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) @@ -96,9 +96,9 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sub_reg_reg ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll index eb638da3904055..4332d9daeaaf5e 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) 
#0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,13 +23,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[8:11], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB0_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -73,7 +73,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -82,13 +82,13 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_3 ; GCN-NEXT: ; %bb.1: ; %atomic ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:400 -; GCN-NEXT: s_load_dword s2, s[2:3], 0xf +; GCN-NEXT: s_load_dword s2, s[0:1], 0xf ; GCN-NEXT: 
s_mov_b64 s[0:1], 0 ; GCN-NEXT: .LBB1_2: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 90a3d350e7416e..63688ebeab9d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -23,10 +23,10 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1 ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[8:11], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf +; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s8, s10 ; GCN-NEXT: s_mov_b32 s9, s10 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -58,7 +58,7 @@ exit: define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 { ; GCN-LABEL: atomic_max_i32_noret: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -67,10 +67,10 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[4:7], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %atomic -; GCN-NEXT: s_load_dword s0, s[2:3], 0xf +; GCN-NEXT: s_load_dword s0, s[0:1], 0xf ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index ece7e28c763fb1..9d6e0927b0dfd6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,7 +8,7 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -45,7 +45,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -87,7 +87,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -125,7 +125,7 @@ define amdgpu_kernel void 
@ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -168,7 +168,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -205,7 +205,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -249,7 +249,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 @@ -287,7 +287,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 diff --git 
a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll index 4630b0d7ef50ba..1cd9afef13b5e2 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -24,9 +24,9 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: exp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, 
implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -45,9 +45,9 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -65,9 +65,9 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: log_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -86,9 +86,9 @@ define amdgpu_kernel void @log_f16(ptr 
addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -106,9 +106,9 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rcp_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -127,9 +127,9 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f32 ; CHECK: bb.0 (%ir-block.0): - ; 
CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -147,9 +147,9 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: rsq_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) @@ -168,9 +168,9 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f32 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY 
$sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1) @@ -188,9 +188,9 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) { define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: name: sqrt_f16 ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 4aed9dc2fca6ca..4ba5f3abcb24b1 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,49 +5,49 @@ ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: 
{{^}}sextload_private_offset_i8: -; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i8 %load to i32 @@ -56,7 +56,7 @@ define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 { %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i8 %load to i32 @@ -65,14 +65,14 @@ define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i16: -; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %sextload = sext i16 %load to i32 @@ -81,7 +81,7 @@ define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 { %load = load 
volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) %zextload = zext i16 %load to i32 @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[12:15], 0 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5)) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[12:15], 0 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4095 to ptr addrspace(5)) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 
{ store volatile i8 5, ptr addrspace(5) inttoptr (i32 4096 to ptr addrspace(5)) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[12:15], 0 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, ptr addrspace(5) inttoptr (i32 4097 to ptr addrspace(5)) ret void diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0889f8ef6316ed..b4272049f36a4c 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_mul_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -31,7 +31,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: test_mul_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -50,7 +50,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_mul_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: test_mul_v2i32: ; GFX10: ; %bb.0: 
; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: test_mul_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -109,7 +109,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_mul_v2i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -157,7 +157,7 @@ entry: define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -179,7 +179,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_mul_v4i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -201,7 +201,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_mul_v4i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -223,7 +223,7 @@ define 
amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_mul_v4i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -246,7 +246,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_mul_v4i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -271,7 +271,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX12-LABEL: v_mul_v4i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -326,9 +326,9 @@ entry: define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: s_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -341,9 +341,9 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; VI-LABEL: s_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -356,10 
+356,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX9-LABEL: s_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34 -; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 @@ -373,11 +373,11 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX10-LABEL: s_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mul_i32 s0, s0, s6 +; GFX10-NEXT: s_mul_i32 s0, s2, s6 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -386,8 +386,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX11-LABEL: s_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mul_i32 s0, s0, s6 @@ -401,8 +401,8 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-LABEL: s_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34 +; GFX12-NEXT: 
s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mul_i32 s0, s0, s6 @@ -433,98 +433,98 @@ entry: define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_trunc_i64_mul_to_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_trunc_i64_mul_to_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, 
s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_trunc_i64_mul_to_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_trunc_i64_mul_to_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; 
GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_trunc_i64_mul_to_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -548,8 +548,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: v_trunc_i64_mul_to_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 
s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s14, s10 @@ -603,8 +603,8 @@ entry: define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -617,11 +617,11 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -630,43 +630,43 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_sext_c: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_sext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -676,7 +676,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) @@ -711,8 +711,8 @@ entry: define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x50 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -725,11 +725,11 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x50 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_nop 2 @@ -738,43 +738,43 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX9-LABEL: mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50 -; GFX9-NEXT: s_mulk_i32 s4, 0x50 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 +; GFX9-NEXT: s_mulk_i32 s2, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: mul64_zext_c: ; 
GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 +; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: mul64_zext_c: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, 0x50 -; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50 +; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 +; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -784,7 +784,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 @@ -818,7 +818,7 @@ entry: 
define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -838,7 +838,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_sext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -857,7 +857,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_sext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -877,7 +877,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_sext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -896,7 +896,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_sext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -917,7 +917,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: v_mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 
; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -965,7 +965,7 @@ entry: define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_zext_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -985,7 +985,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: v_mul64_zext_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1004,7 +1004,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: v_mul64_zext_c: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX10-LABEL: v_mul64_zext_c: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX11-LABEL: v_mul64_zext_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1064,7 +1064,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: 
v_mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1112,7 +1112,7 @@ entry: define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul64_sext_inline_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1131,7 +1131,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_mul64_sext_inline_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_mul64_sext_inline_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1168,7 +1168,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_mul64_sext_inline_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1187,7 +1187,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_mul64_sext_inline_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, 
-1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX12-LABEL: v_mul64_sext_inline_imm: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1256,9 +1256,9 @@ entry: define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: s_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; VI-LABEL: s_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,41 +1282,40 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX9-LABEL: s_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; 
GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s0, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s4, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-NEXT: s_mul_i32 s0, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s2, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1327,13 +1326,12 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-LABEL: 
s_mul_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0x31016000 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_i32 s2, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s2, s2, s3 +; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -1360,7 +1358,7 @@ entry: define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1378,7 +1376,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_mul_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1396,7 +1394,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_mul_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1414,7 +1412,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX10-LABEL: v_mul_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 
s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1432,7 +1430,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX11-LABEL: v_mul_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1452,7 +1450,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX12-LABEL: v_mul_i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1498,9 +1496,9 @@ entry: define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ; SI-LABEL: s_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dword s5, s[0:1], 0x1c +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,9 +1510,9 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; VI-LABEL: s_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x70 -; VI-NEXT: s_load_dword s5, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x70 +; VI-NEXT: s_load_dword s5, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1526,42 +1524,42 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX9-LABEL: s_mul_i1: ; GFX9: ; %bb.0: ; %entry -; 
GFX9-NEXT: s_load_dword s4, s[2:3], 0x70 -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5 +; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 
; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 @@ -1572,13 +1570,13 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-LABEL: s_mul_i1: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c -; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -1622,7 +1620,7 @@ entry: define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_mul_i1: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1642,7 +1640,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: v_mul_i1: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1662,7 +1660,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: v_mul_i1: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 
0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1682,7 +1680,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX10-LABEL: v_mul_i1: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s10, s6 @@ -1703,7 +1701,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX11-LABEL: v_mul_i1: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -1727,7 +1725,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX12-LABEL: v_mul_i1: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s6, -1 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s10, s6 @@ -1795,8 +1793,8 @@ entry: define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1815,8 +1813,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; VI-LABEL: s_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1833,8 +1831,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,18 +1852,18 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX10-LABEL: s_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s1, s6, s1 -; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s2, s1 -; GFX10-NEXT: s_mul_i32 s2, s7, s0 -; GFX10-NEXT: s_mul_i32 s0, s6, s0 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_mul_i32 s0, s6, s3 +; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_add_i32 s0, s1, s0 +; GFX10-NEXT: s_mul_i32 s1, s7, s2 +; GFX10-NEXT: s_mul_i32 s2, s6, s2 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 @@ -1875,8 +1873,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX11-LABEL: s_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s1, s6, s1 @@ -1898,8 +1896,8 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-LABEL: s_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 @@ -1934,21 +1932,21 @@ entry: define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ; SI-LABEL: v_mul_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -1956,52 +1954,52 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ; SI-NEXT: 
v_add_i32_e32 v1, vcc, v1, v4 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_mul_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v4, v2, v1 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0 +; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; 
GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2009,27 +2007,27 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_mul_i64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31016000 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_mov_b32 s14, s10 -; GFX10-NEXT: s_mov_b32 s15, s11 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s10, s2 +; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: s_mov_b32 s9, s5 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 @@ -2037,14 +2035,14 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 @@ -2074,8 +2072,8 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-LABEL: v_mul_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s10, -1 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, s10 @@ -2136,19 +2134,19 @@ entry: define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ; SI-LABEL: mul32_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; 
%bb.1: ; %else -; SI-NEXT: s_mul_i32 s6, s0, s1 +; SI-NEXT: s_mul_i32 s6, s2, s3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc @@ -2171,19 +2169,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul32_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mul_i32 s6, s0, s1 +; VI-NEXT: s_mul_i32 s6, s2, s3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %if @@ -2206,19 +2204,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul32_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s6, s0, s1 +; GFX9-NEXT: s_mul_i32 s6, s2, s3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $sgpr6 ; GFX9-NEXT: .LBB15_3: ; %Flow -; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %if @@ -2241,19 +2239,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul32_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s5, s0, s1 +; GFX10-NEXT: s_mul_i32 s5, s2, s3 ; GFX10-NEXT: s_branch .LBB15_3 ; GFX10-NEXT: .LBB15_2: ; GFX10-NEXT: s_mov_b32 s4, -1 ; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB15_3: ; %Flow -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX10-NEXT: ; %bb.4: ; %if @@ -2276,19 +2274,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul32_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s5, s0, s1 +; GFX11-NEXT: s_mul_i32 s5, s2, s3 ; GFX11-NEXT: s_branch .LBB15_3 ; GFX11-NEXT: .LBB15_2: ; GFX11-NEXT: s_mov_b32 s4, -1 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: .LBB15_3: ; %Flow -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 ; 
GFX11-NEXT: ; %bb.4: ; %if @@ -2313,19 +2311,19 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul32_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX12-NEXT: ; %bb.1: ; %else -; GFX12-NEXT: s_mul_i32 s5, s0, s1 +; GFX12-NEXT: s_mul_i32 s5, s2, s3 ; GFX12-NEXT: s_branch .LBB15_3 ; GFX12-NEXT: .LBB15_2: ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: ; implicit-def: $sgpr5 ; GFX12-NEXT: .LBB15_3: ; %Flow -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX12-NEXT: ; %bb.4: ; %if @@ -2405,7 +2403,7 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -2440,7 +2438,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2472,7 +2470,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -2508,7 +2506,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2542,7 +2540,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2577,7 +2575,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 @@ -2670,9 +2668,9 @@ endif: define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ; SI-LABEL: s_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2719,9 +2717,9 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; VI-LABEL: s_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2758,96 +2756,96 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s7, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 -; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_mul_i32 s12, s9, s6 -; GFX9-NEXT: s_add_i32 s7, s7, s12 -; GFX9-NEXT: s_mul_i32 s12, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4 -; GFX9-NEXT: s_add_i32 s12, s13, s12 -; GFX9-NEXT: s_mul_i32 s11, s11, s4 -; GFX9-NEXT: s_mul_i32 s6, s8, s6 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s10, s10, s4 -; GFX9-NEXT: s_add_u32 s10, s10, s6 -; GFX9-NEXT: s_addc_u32 s11, s12, s7 -; GFX9-NEXT: s_mul_i32 s14, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8 +; GFX9-NEXT: s_mul_i32 s0, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s2, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 +; GFX9-NEXT: s_add_i32 s0, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s13, s10 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s15, s8 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s12, s10 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s3, s14, s8 +; GFX9-NEXT: s_add_u32 
s3, s3, s1 +; GFX9-NEXT: s_addc_u32 s2, s2, s0 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 ; GFX9-NEXT: s_add_u32 s14, s14, s15 -; GFX9-NEXT: s_mul_i32 s7, s4, s9 -; GFX9-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9 -; GFX9-NEXT: s_add_u32 s7, s7, s14 -; GFX9-NEXT: s_addc_u32 s12, s12, 0 -; GFX9-NEXT: s_add_u32 s12, s13, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: s_addc_u32 s9, s14, s13 -; GFX9-NEXT: s_add_u32 s10, s5, s10 -; GFX9-NEXT: s_mul_i32 s4, s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s6 -; GFX9-NEXT: s_addc_u32 s9, s9, s11 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_mul_i32 s1, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s9, s9, s3 +; GFX9-NEXT: s_addc_u32 s10, s10, s2 +; GFX9-NEXT: s_mul_i32 s2, s8, s12 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x4c -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s3, s12 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s13, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mul_i32 s2, s8, s7 +; GFX10-NEXT: s_mul_i32 s3, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX10-NEXT: s_mul_i32 s14, s10, s5 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX10-NEXT: s_mul_i32 s13, s9, s6 +; GFX10-NEXT: s_mul_i32 s12, s9, s6 ; GFX10-NEXT: s_mul_i32 s11, s11, s4 -; GFX10-NEXT: s_add_i32 s2, s7, s2 +; GFX10-NEXT: s_add_i32 s3, s7, s3 ; GFX10-NEXT: s_add_i32 s7, s15, s14 ; GFX10-NEXT: s_mul_i32 s6, s8, s6 ; GFX10-NEXT: s_mul_i32 s10, s10, s4 -; GFX10-NEXT: s_add_i32 s2, s2, s13 +; GFX10-NEXT: s_add_i32 s3, s3, s12 ; GFX10-NEXT: s_add_i32 s7, s7, s11 ; GFX10-NEXT: s_mul_i32 s19, s5, s8 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX10-NEXT: s_add_u32 s6, s10, s6 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX10-NEXT: s_addc_u32 s7, s7, s2 +; GFX10-NEXT: s_addc_u32 s7, s7, s3 ; GFX10-NEXT: s_mul_i32 s17, s4, s9 -; GFX10-NEXT: s_add_u32 s2, s19, s20 +; GFX10-NEXT: s_add_u32 s3, s19, s20 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX10-NEXT: s_mul_i32 s5, s5, s9 ; GFX10-NEXT: s_addc_u32 s9, s18, 0 -; GFX10-NEXT: s_add_u32 s13, s17, s2 +; GFX10-NEXT: s_add_u32 s3, s17, s3 ; GFX10-NEXT: s_addc_u32 s10, s16, 0 -; GFX10-NEXT: s_mul_i32 s2, s4, s8 +; GFX10-NEXT: s_mul_i32 s12, s4, s8 ; GFX10-NEXT: s_add_u32 s4, s9, s10 ; GFX10-NEXT: s_addc_u32 s8, 0, 0 ; GFX10-NEXT: s_add_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s21, s8 ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX10-NEXT: s_or_b64 
s[2:3], s[12:13], s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 @@ -2860,46 +2858,46 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX11-LABEL: s_mul_i128: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s12 +; GFX11-NEXT: s_mov_b32 s13, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s2, s8, s7 +; GFX11-NEXT: s_mul_i32 s3, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ; GFX11-NEXT: s_mul_i32 s14, s10, s5 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 -; GFX11-NEXT: s_mul_i32 s13, s9, s6 +; GFX11-NEXT: s_mul_i32 s12, s9, s6 ; GFX11-NEXT: s_mul_i32 s11, s11, s4 -; GFX11-NEXT: s_add_i32 s2, s7, s2 +; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s7, s15, s14 ; GFX11-NEXT: s_mul_i32 s6, s8, s6 ; GFX11-NEXT: s_mul_i32 s10, s10, s4 -; GFX11-NEXT: s_add_i32 s2, s2, s13 +; GFX11-NEXT: s_add_i32 s3, s3, s12 ; GFX11-NEXT: s_add_i32 s7, s7, s11 ; GFX11-NEXT: s_mul_i32 s19, s5, s8 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ; GFX11-NEXT: s_add_u32 s6, s10, s6 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 -; GFX11-NEXT: s_addc_u32 s7, s7, s2 +; GFX11-NEXT: s_addc_u32 s7, s7, s3 ; GFX11-NEXT: s_mul_i32 s17, s4, s9 -; GFX11-NEXT: s_add_u32 s2, s19, s20 +; GFX11-NEXT: s_add_u32 s3, s19, s20 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ; GFX11-NEXT: s_mul_i32 s5, s5, s9 ; GFX11-NEXT: s_addc_u32 s9, s18, 0 -; GFX11-NEXT: s_add_u32 s13, s17, s2 +; GFX11-NEXT: s_add_u32 s3, s17, s3 ; GFX11-NEXT: s_addc_u32 s10, s16, 
0 -; GFX11-NEXT: s_mul_i32 s2, s4, s8 +; GFX11-NEXT: s_mul_i32 s12, s4, s8 ; GFX11-NEXT: s_add_u32 s4, s9, s10 ; GFX11-NEXT: s_addc_u32 s8, 0, 0 ; GFX11-NEXT: s_add_u32 s4, s5, s4 ; GFX11-NEXT: s_addc_u32 s5, s21, s8 ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] +; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 @@ -2913,40 +2911,40 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-LABEL: s_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c -; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c -; GFX12-NEXT: s_mov_b32 s13, 0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-NEXT: s_mov_b32 s15, s13 -; GFX12-NEXT: s_mov_b32 s3, s13 -; GFX12-NEXT: s_mov_b32 s17, s13 -; GFX12-NEXT: s_mov_b32 s19, s13 -; GFX12-NEXT: s_mov_b32 s24, s13 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c +; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s15, s3 +; GFX12-NEXT: s_mov_b32 s13, s3 +; GFX12-NEXT: s_mov_b32 s17, s3 +; GFX12-NEXT: s_mov_b32 s19, s3 +; GFX12-NEXT: s_mov_b32 s24, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s12, s4 +; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s14, s8 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13] -; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13] -; GFX12-NEXT: s_mov_b32 s12, s23 +; GFX12-NEXT: s_mov_b32 s12, s9 +; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3] +; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3] +; GFX12-NEXT: s_mov_b32 s2, s23 ; GFX12-NEXT: s_mov_b32 s16, s5 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11] -; GFX12-NEXT: 
s_add_nc_u64 s[10:11], s[20:21], s[12:13] +; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3] ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9] ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17] -; GFX12-NEXT: s_mov_b32 s12, s11 -; GFX12-NEXT: s_mov_b32 s11, s13 +; GFX12-NEXT: s_mov_b32 s2, s11 +; GFX12-NEXT: s_mov_b32 s11, s3 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11] -; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17] +; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17] ; GFX12-NEXT: s_mov_b32 s18, s7 +; GFX12-NEXT: s_mov_b32 s23, s3 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] ; GFX12-NEXT: s_mov_b32 s25, s6 -; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19] -; GFX12-NEXT: s_mov_b32 s23, s13 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] -; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25] +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3] +; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 @@ -3013,7 +3011,7 @@ entry: define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ; SI-LABEL: v_mul_i128: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 @@ -3062,7 +3060,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; VI-LABEL: v_mul_i128: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v11, 0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) @@ -3102,7 +3100,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3133,7 +3131,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX10-LABEL: v_mul_i128: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3165,9 +3163,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX11-LABEL: v_mul_i128: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -3205,9 +3201,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 842dc36e001545..357b851a8f56f1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr 
addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -100,7 +100,7 @@ entry: define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,7 +126,7 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: 
s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +274,26 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 { ; SI-LABEL: test_smul24_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dword s5, s[2:3], 0x1c -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x13 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1c +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s5, s5, 0x180000 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_mul_i32 s4, s5, s4 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bfe_i32 s1, s2, 0x180000 +; SI-NEXT: s_bfe_i32 s0, s0, 0x180000 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: s_mul_i32 s1, s0, s1 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dword s5, s[2:3], 0x70 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dword s5, s[0:1], 0x70 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -307,19 +307,19 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 ; ; GFX9-LABEL: test_smul24_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: 
s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0 +; GFX9-NEXT: s_mul_i32 s1, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64: @@ -376,8 +376,8 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i64_square: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -390,8 +390,8 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_smul24_i64_square: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -403,17 +403,17 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, ; ; GFX9-LABEL: test_smul24_i64_square: ; GFX9: 
; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i64_square: @@ -463,33 +463,33 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 { ; SI-LABEL: test_smul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s4, 8 -; SI-NEXT: s_lshl_b32 s7, s6, 8 -; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_mul_i32 s5, s4, s6 -; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_lshl_b32 s1, s2, 8 +; SI-NEXT: s_lshl_b32 s3, s0, 8 +; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; SI-NEXT: s_ashr_i64 s[0:1], 
s[0:1], 40 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mul_i32 s1, s0, s2 +; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dword s5, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshl_b32 s3, s2, 8 +; VI-NEXT: s_lshl_b32 s5, s4, 8 ; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 ; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -504,23 +504,23 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_smul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6 -; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i33: @@ -580,9 +580,9 @@ entry: define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_smulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,9 +594,9 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_smulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -608,20 +608,20 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_smulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s6, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, 8 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GFX9-NEXT: s_lshl_b32 s5, s6, 8 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40 -; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshl_b32 s1, s2, 8 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 +; GFX9-NEXT: s_lshl_b32 s1, s3, 8 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40 +; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i33: @@ -672,15 +672,15 @@ entry: define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { ; SI-LABEL: simplify_i24_crash: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %bb7 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB8_2: ; %bb11 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 @@ -694,15 +694,15 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; VI-LABEL: simplify_i24_crash: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB8_2 ; 
VI-NEXT: ; %bb.1: ; %bb7 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB8_2: ; %bb11 -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,24 +716,24 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, ; ; GFX9-LABEL: simplify_i24_crash: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x2c +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %bb7 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB8_2: ; %bb11 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s4, 0x180000 -; GFX9-NEXT: s_bfe_i32 s5, s6, 0x180000 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: simplify_i24_crash: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 0c0bb830ba847b..3a16c88f32cc3e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ 
-9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffffff @@ -24,7 +24,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -64,13 +64,13 @@ entry: define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16_sext: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_sext_i32_i16 s4, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_sext_i32_i16 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -78,8 +78,8 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; VI-LABEL: test_umul24_i16_sext: ; VI: ; %bb.0: ; %entry -; VI-NEXT: 
s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,16 +92,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i ; ; GFX9-LABEL: test_umul24_i16_sext: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -113,7 +113,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr_sext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -136,7 +136,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; VI-LABEL: test_umul24_i16_vgpr_sext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -158,7 +158,7 @@ define amdgpu_kernel void 
@test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: test_umul24_i16_vgpr_sext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -186,13 +186,13 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) { ; SI-LABEL: test_umul24_i16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, 16 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_and_b32 s4, s2, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -200,8 +200,8 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; VI-LABEL: test_umul24_i16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -214,16 +214,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b ; ; GFX9-LABEL: test_umul24_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 
0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_mul_i32 s2, s2, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %mul = mul i16 %a, %b @@ -235,7 +235,7 @@ entry: define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_umul24_i16_vgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 @@ -258,7 +258,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: test_umul24_i16_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs ; ; GFX9-LABEL: test_umul24_i16_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; SI-LABEL: test_umul24_i8_vgpr: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: 
s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: v_mov_b32_e32 v4, 0 @@ -330,8 +330,8 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: test_umul24_i8_vgpr: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 @@ -351,11 +351,11 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: test_umul24_i8_vgpr: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v3, v1, s[0:1] +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -379,7 +379,7 @@ entry: define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -392,7 +392,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +405,7 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; 
GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -432,9 +432,9 @@ entry: define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umulhi24: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -447,9 +447,9 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: test_umulhi24: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -462,18 +462,19 @@ define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: test_umulhi24: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: 
s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i64 %a, 16777215 @@ -489,9 +490,9 @@ entry: define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: test_umul24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[2:3], 0xd +; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 @@ -508,9 +509,9 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; VI-LABEL: test_umul24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s7, s[2:3], 0x34 +; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 @@ -524,20 +525,21 @@ define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b ; ; GFX9-LABEL: test_umul24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, 0xffffff -; GFX9-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s2, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s1, s0 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; 
GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -580,8 +582,8 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: test_umul24_i64_square: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -594,8 +596,8 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: test_umul24_i64_square: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -606,17 +608,17 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3 ; ; GFX9-LABEL: test_umul24_i64_square: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s5, s4, s4 -; GFX9-NEXT: s_mul_i32 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0 +; GFX9-NEXT: s_mul_i32 s0, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i64 %a, 40 @@ -629,7 +631,7 @@ entry: define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xffff @@ -645,7 +647,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -661,7 +663,7 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff @@ -683,27 +685,27 @@ entry: define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umul24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dword s5, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s6, s4, 0xffffff -; SI-NEXT: s_and_b32 s7, s5, 0xffffff -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s6, s6, s7 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -716,20 +718,20 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_i32 s6, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; 
GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 @@ -745,9 +747,9 @@ entry: define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) { ; SI-LABEL: test_umulhi24_i33: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dword s5, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dword s5, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -759,9 +761,9 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; VI-LABEL: test_umulhi24_i33: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dword s5, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -773,18 +775,18 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dword s5, s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s3, 
s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s4, 0xffffff -; GFX9-NEXT: s_and_b32 s5, s5, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm entry: %tmp0 = shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 727b607e7ded06..16de2c0c6de08c 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -163,7 +163,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; ; GCN-LABEL: multi_if_break_loop: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s2, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll index 517f5738c9fc34..c72c65a6d62e18 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll @@ -27,19 +27,19 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND @@ -50,7 +50,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -62,20 +62,15 @@ bb: define amdgpu_kernel void @kernel_call() { ; CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_callee@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, 
csr_vgpr_spill_fp_callee@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_callee() @@ -87,23 +82,23 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: v_writelane_b32 v1, s33, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 -; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: s_setpc_b64 s[16:17] +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_setpc_b64 s[4:5] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() tail call fastcc void @callee_has_fp() @@ -113,20 +108,15 @@ bb: define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: 
kernel_tailcall: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s15 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 -; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee() diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 296d484e247d6e..f6e3509eb029b1 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocapture %arg) #0 { ; GCN-LABEL: reduced_nested_loop_conditions: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dword s0, s[2:3], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: s_mov_b32 s2, 0 @@ -93,6 +93,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void +; bb: %my.tmp 
= tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp @@ -276,6 +277,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void +; bb: %my.tmp1134 = load volatile i32, ptr addrspace(1) undef %my.tmp1235 = icmp slt i32 %my.tmp1134, 9 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 2ccdf5bebff7b5..150d051a422a8a 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -120,61 +120,61 @@ bb.2: ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) { -; DEFAULTSIZE-V5-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; DEFAULTSIZE-V5: ; %bb.0: ; %entry -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x8 -; DEFAULTSIZE-V5-NEXT: s_add_u32 s0, s0, s15 -; DEFAULTSIZE-V5-NEXT: s_addc_u32 s1, s1, 0 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s33, 0 -; DEFAULTSIZE-V5-NEXT: s_movk_i32 s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_cmp_lg_u32 s4, 0 -; DEFAULTSIZE-V5-NEXT: s_cbranch_scc1 .LBB1_2 -; DEFAULTSIZE-V5-NEXT: ; %bb.1: ; %bb.0 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s32, 0x1000 -; DEFAULTSIZE-V5-NEXT: s_and_b32 s4, s4, 0xfffff000 -; DEFAULTSIZE-V5-NEXT: s_lshl_b32 s5, s5, 2 -; DEFAULTSIZE-V5-NEXT: s_mov_b32 s32, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v1, 0 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v3, 1 -; DEFAULTSIZE-V5-NEXT: s_add_i32 s4, s4, s5 -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; DEFAULTSIZE-V5-NEXT: 
v_mov_b32_e32 v2, s4 -; DEFAULTSIZE-V5-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; DEFAULTSIZE-V5-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: v_add_u32_e32 v0, v2, v0 -; DEFAULTSIZE-V5-NEXT: s_waitcnt lgkmcnt(0) -; DEFAULTSIZE-V5-NEXT: global_store_dword v1, v0, s[4:5] -; DEFAULTSIZE-V5-NEXT: .LBB1_2: ; %bb.1 -; DEFAULTSIZE-V5-NEXT: v_mov_b32_e32 v0, 0 -; DEFAULTSIZE-V5-NEXT: global_store_dword v[0:1], v0, off -; DEFAULTSIZE-V5-NEXT: s_waitcnt vmcnt(0) -; DEFAULTSIZE-V5-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 +; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: v_mov_b32_e32 v3, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] +; MUBUF-NEXT: .LBB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: ; FLATSCR: ; 
%bb.0: ; %entry -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 ; FLATSCR-NEXT: s_mov_b32 s33, 0 ; FLATSCR-NEXT: s_mov_b32 s32, 64 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0 +; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 -; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 +; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s0 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 -; FLATSCR-NEXT: scratch_load_dword v2, off, s0 -; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 +; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: scratch_load_dword v2, off, s2 +; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) @@ -406,6 +406,3 @@ attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amd !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; ASSUME1024: {{.*}} -; DEFAULTSIZE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 5c09d2bd61a399..9ab3eccd986a53 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -2104,7 +2104,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2115,7 +2115,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2128,7 +2128,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc @@ -2138,7 +2138,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS @@ -2154,7 +2154,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { define amdgpu_kernel void 
@flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2165,7 +2165,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2178,7 +2178,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc @@ -2188,7 +2188,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS @@ -2204,7 +2204,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2215,7 +2215,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2228,7 +2228,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2238,7 +2238,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2254,7 +2254,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2267,7 +2267,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2280,7 +2280,7 @@ define 
amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2292,7 +2292,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2302,7 +2302,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2315,7 +2315,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2334,7 +2334,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2347,7 +2347,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2360,7 +2360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2372,7 +2372,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS @@ -2382,7 +2382,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2395,7 +2395,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2427,7 +2427,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2440,7 +2440,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2452,7 +2452,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2462,7 +2462,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: 
flat_inst_salu_offset_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2475,7 +2475,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2494,7 +2494,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2507,7 +2507,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2520,7 +2520,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -2532,7 +2532,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2555,7 +2555,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2598,7 +2598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc @@ -2608,7 +2608,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS @@ -2624,7 +2624,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2637,7 +2637,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2650,7 +2650,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2662,7 +2662,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS @@ -2672,7 +2672,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2685,7 +2685,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2704,7 +2704,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2717,7 +2717,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; 
GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0 @@ -2730,7 +2730,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2742,7 +2742,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS @@ -2752,7 +2752,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0 @@ -2784,7 +2784,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_2x_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2797,7 +2797,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2810,7 +2810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2822,7 +2822,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS @@ -2832,7 +2832,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2845,7 +2845,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2877,7 +2877,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2890,7 +2890,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2902,7 +2902,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS @@ -2912,7 +2912,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2925,7 +2925,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2944,7 +2944,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -2957,7 +2957,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-NEXT: s_addc_u32 s1, s1, -1 @@ -2970,7 
+2970,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2982,7 +2982,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS @@ -2992,7 +2992,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3005,7 +3005,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -3025,7 +3025,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3037,7 +3037,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3050,7 +3050,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3062,7 +3062,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3074,7 +3074,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: 
flat_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3100,7 +3100,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3120,7 +3120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3145,7 +3145,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3157,7 +3157,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3182,7 +3182,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3195,7 +3195,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3215,7 +3215,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-SDAG: ; 
%bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 @@ -3227,7 +3227,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3240,7 +3240,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3252,7 +3252,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3264,7 +3264,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3277,7 +3277,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr 
%p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3290,7 +3290,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3336,7 +3336,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 
0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3348,7 +3348,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3360,7 +3360,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3373,7 +3373,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3386,7 +3386,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3406,7 +3406,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; 
GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3419,7 +3419,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3432,7 +3432,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3456,7 +3456,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3469,7 +3469,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3482,7 +3482,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3502,7 +3502,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 @@ -3515,7 +3515,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 2 @@ -3528,7 +3528,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3540,7 +3540,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3552,7 +3552,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3565,7 +3565,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3578,7 +3578,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; 
GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3625,7 +3625,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0 @@ -3638,7 +3638,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3651,7 +3651,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; 
GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3664,7 +3664,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3677,7 +3677,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3697,7 +3697,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3711,7 +3711,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3724,7 +3724,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: 
flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 @@ -3737,7 +3737,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3750,7 +3750,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3763,7 +3763,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3776,7 +3776,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: 
s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3796,7 +3796,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3810,7 +3810,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3823,7 +3823,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0 @@ -3836,7 +3836,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3849,7 +3849,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: 
flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3875,7 +3875,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3895,7 +3895,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -3909,7 +3909,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3922,7 +3922,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 @@ -3935,7 +3935,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -3948,7 +3948,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3961,7 +3961,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -3974,7 +3974,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; ; 
GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -3994,7 +3994,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4008,7 +4008,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4021,7 +4021,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0 @@ -4034,7 +4034,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4047,7 +4047,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4060,7 +4060,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4073,7 +4073,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 @@ -4093,7 +4093,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -4107,7 +4107,7 @@ 
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4120,7 +4120,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 @@ -4133,7 +4133,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0 @@ -4146,7 +4146,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4159,7 +4159,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000 @@ -4172,7 +4172,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; ; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index b5b8213bcd57ee..10381bc21ecc96 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -2176,7 +2176,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc @@ -2186,7 +2186,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX10-LABEL: global_inst_salu_offset_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc @@ -2196,7 +2196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX11-LABEL: global_inst_salu_offset_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 
v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc @@ -2208,7 +2208,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { ; ; GFX12-LABEL: global_inst_salu_offset_1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS @@ -2226,7 +2226,7 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) { define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc @@ -2236,7 +2236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2246,7 +2246,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc @@ -2258,7 +2258,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS @@ -2276,7 +2276,7 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2286,7 +2286,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2296,7 +2296,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2308,7 +2308,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2326,7 +2326,7 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p define 
amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2336,7 +2336,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX10-LABEL: global_inst_salu_offset_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2346,7 +2346,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX11-LABEL: global_inst_salu_offset_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2358,7 +2358,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p ; ; GFX12-LABEL: global_inst_salu_offset_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2376,7 +2376,7 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc @@ -2386,7 +2386,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc @@ -2408,7 +2408,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS @@ -2426,7 +2426,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2436,7 +2436,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-GISEL: ; 
%bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2449,7 +2449,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2461,7 +2461,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2473,7 +2473,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2490,7 +2490,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000 @@ -2502,7 
+2502,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2515,7 +2515,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2530,7 +2530,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2542,7 +2542,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2553,7 +2553,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1 define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2593,7 +2593,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2605,7 +2605,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS @@ -2623,7 +2623,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) define amdgpu_kernel void 
@global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2633,7 +2633,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2643,7 +2643,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2655,7 +2655,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS @@ -2673,7 +2673,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc @@ -2683,7 +2683,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc @@ -2693,7 +2693,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc @@ -2705,7 +2705,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) ; ; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS @@ -2723,7 +2723,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc @@ -2733,7 +2733,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; 
GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2746,7 +2746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS @@ -2770,7 +2770,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2787,7 +2787,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 
s0, s0, 0xffffe000 @@ -2799,7 +2799,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2812,7 +2812,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2827,7 +2827,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS @@ -2839,7 +2839,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2850,7 +2850,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2870,7 +2870,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000 @@ -2882,7 +2882,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2895,7 +2895,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1 @@ -2910,7 +2910,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS @@ -2922,7 +2922,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 @@ -2933,7 +2933,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2954,7 +2954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -2966,7 +2966,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 
0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -2994,7 +2994,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3009,7 +3009,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3020,7 +3020,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3034,7 +3034,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3055,7 +3055,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3080,7 +3080,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3095,7 +3095,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3121,7 +3121,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3135,7 +3135,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3156,7 +3156,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3168,7 +3168,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3181,7 +3181,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3196,7 +3196,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3211,7 +3211,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3222,7 +3222,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3236,7 +3236,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3257,7 +3257,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3269,7 +3269,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3282,7 +3282,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3297,7 +3297,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr 
addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3323,7 +3323,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3337,7 +3337,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3358,7 +3358,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3370,7 +3370,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3383,7 +3383,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3398,7 +3398,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3413,7 +3413,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3424,7 +3424,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3438,7 +3438,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp ; ; 
GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3459,7 +3459,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3471,7 +3471,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3484,7 +3484,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000 ; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2 @@ -3499,7 +3499,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 ; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2 @@ -3514,7 +3514,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 @@ -3525,7 +3525,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3539,7 +3539,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3572,7 +3572,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3584,7 +3584,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff @@ -3598,7 +3598,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff @@ -3612,7 +3612,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3634,7 +3634,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x800 @@ -3646,7 +3646,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x800 @@ -3658,7 +3658,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x800 @@ -3672,7 +3672,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800 @@ -3686,7 +3686,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3708,7 +3708,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: 
global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0xfff @@ -3720,7 +3720,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0xfff @@ -3732,7 +3732,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0xfff @@ -3746,7 +3746,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff @@ -3760,7 +3760,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3782,7 +3782,7 @@ define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3794,7 +3794,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3806,7 +3806,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1000 @@ -3820,7 +3820,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000 @@ -3834,7 +3834,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3856,7 +3856,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3868,7 +3868,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3880,7 +3880,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff @@ -3894,7 +3894,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: ; 
GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 @@ -3930,7 +3930,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) { ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3942,7 +3942,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3954,7 +3954,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s0, s0, 0x2000 @@ -3968,7 +3968,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000 @@ -3982,7 +3982,7 @@ define 
amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p ; ; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000 ; GFX12-SDAG-NEXT: s_brev_b32 s3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index df15f98ae27ff6..769d035858ca83 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -25,7 +25,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -43,14 +43,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -59,14 +58,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -86,7 +84,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -103,7 +101,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -121,14 +119,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, 
v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -137,14 +134,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 @@ -164,7 +160,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 { ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -181,7 +177,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -199,14 +195,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -215,14 +210,13 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_nop 0 @@ -242,7 +236,7 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 { ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -259,7 +253,7 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -277,14 +271,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 @@ -293,14 +286,13 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out ; ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1] ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_nop 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll index d73b1bd29c9813..bd7f9014d55cae 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll @@ -4,14 +4,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 0 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1 %cmp = icmp eq i32 %and, 0 @@ -23,14 +23,14 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_1024: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 10 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 1024 %cmp = icmp eq i32 %and, 0 @@ -42,14 +42,14 @@ define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p) { define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x80000000: ; GCN: ; %bb.0: -; 
GCN-NEXT: s_load_dword s4, s[2:3], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp0_b32 s4, 31 -; GCN-NEXT: s_cselect_b32 s2, 22, 33 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_dword v0, v1, s[0:1] +; GCN-NEXT: s_cselect_b32 s0, 22, 33 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %and = and i32 %arg, 2147483648 %cmp = icmp eq i32 %and, 0 @@ -62,7 +62,7 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p) define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) { ; GCN-LABEL: if_masked_0x8000000000000000: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index 4ee2b8e981f449..a50a0766f67c2c 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -92,7 +92,7 @@ bb4: define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) { ; GCN-LABEL: negated_cond_dominated_blocks: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s6, 0 diff 
--git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eff80236d98663..0473f803bfb30e 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -70,7 +70,7 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX8-LABEL: or_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: scalar_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 
s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -185,40 +185,40 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { ; GFX6-LABEL: vector_or_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dword s12, s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s12, s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dword s12, s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s12, s[0:1], 0x34 
+; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i32: @@ -246,8 +246,8 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: scalar_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -258,8 +258,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; ; GFX8-LABEL: scalar_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -286,8 +286,8 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_literal_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -300,8 +300,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; ; GFX8-LABEL: scalar_or_literal_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -332,43 +332,43 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: scalar_or_literal_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d ; GFX6-NEXT: s_movk_i32 s8, 0x3039 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_add_u32 s0, s6, 0x3039 -; GFX6-NEXT: s_addc_u32 s1, s7, 0xf237b +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_add_u32 s0, s0, 0x3039 +; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_literal_multi_use_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 ; GFX8-NEXT: s_movk_i32 s8, 0x3039 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_add_u32 s0, s2, 0x3039 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_add_u32 s0, s0, 0x3039 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0xf237b +; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -408,8 +408,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) @@ -421,8 +421,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; GFX8-LABEL: scalar_or_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -451,44 +451,44 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_or_b32 s2, s6, 63 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_or_b32 s4, s6, 63 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_add_u32 s0, s0, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_add_u32 s0, s8, 63 +; GFX6-NEXT: s_addc_u32 s1, s9, 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: ; GFX8: 
; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_or_b32 s2, s6, 63 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_or_b32 s4, s6, 63 +; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_add_u32 s0, s0, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_add_u32 s0, s8, 63 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm ; @@ -521,8 +521,8 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; GFX6-LABEL: scalar_or_neg_inline_imm_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v1, -1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; ; GFX8-LABEL: scalar_or_neg_inline_imm_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 @@ -565,7 +565,7 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_literal_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -583,7 +583,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_literal_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -624,7 +624,7 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_inline_immediate_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -642,7 +642,7 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; ; GFX8-LABEL: vector_or_inline_immediate_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -683,8 +683,8 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) 
%out, i64 %a, i64 %b) { ; GFX6-LABEL: scalar_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -698,8 +698,8 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX8-LABEL: scalar_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -730,48 +730,48 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: 
s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: vector_or_i64: @@ -803,42 +803,42 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { ; GFX6-LABEL: scalar_vector_or_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; 
GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s6 -; GFX6-NEXT: s_mov_b32 s1, s7 -; GFX6-NEXT: s_mov_b32 s3, s11 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 +; GFX6-NEXT: s_mov_b32 s11, s3 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: scalar_vector_or_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s0, s6 -; GFX8-NEXT: s_mov_b32 s1, s7 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 +; GFX8-NEXT: s_mov_b32 s11, s3 +; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 -; 
GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_vector_or_i64: @@ -867,7 +867,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_loadimm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -886,7 +886,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; ; GFX8-LABEL: vector_or_i64_loadimm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -931,7 +931,7 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -949,7 +949,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; ; GFX8-LABEL: vector_or_i64_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -990,7 +990,7 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr 
addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_inline_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1009,7 +1009,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; ; GFX8-LABEL: vector_or_i64_neg_inline_imm: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1053,7 +1053,7 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; GFX6-LABEL: vector_or_i64_neg_literal: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; ; GFX8-LABEL: vector_or_i64_neg_literal: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s10, s6 @@ -1116,9 +1116,9 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; GFX6-LABEL: trunc_i64_or_to_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x13 -; GFX6-NEXT: s_load_dword s5, s[2:3], 0x1d -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x13 +; GFX6-NEXT: s_load_dword s5, s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,9 +1129,9 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; ; GFX8-LABEL: trunc_i64_or_to_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x4c -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,21 +1159,21 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1181,26 +1181,26 @@ define amdgpu_kernel void 
@or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1208,7 +1208,7 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: @@ -1244,8 +1244,8 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GFX6-LABEL: 
s_or_i1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,8 +1260,8 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; ; GFX8-LABEL: s_or_i1: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 5792fab7011afe..e21b93a386c3e7 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX8-LABEL: s_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -64,7 +64,7 @@ define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -89,7 +89,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -113,7 +113,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -125,7 +125,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX8-LABEL: s_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) @@ -138,7 +138,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX7-LABEL: s_pack_v2f16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -162,7 +162,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -178,7 +178,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX8-LABEL: v_pack_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -200,7 +200,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -240,7 +240,7 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, 
s[0:1] glc @@ -258,7 +258,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX8-LABEL: v_pack_v2f16_user: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -282,7 +282,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2f16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -324,7 +324,7 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -339,7 +339,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -356,7 +356,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -386,7 +386,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel 
void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX8-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -418,7 +418,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -448,7 +448,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -463,7 +463,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX8-LABEL: v_pack_v2f16_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -480,7 +480,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2f16_imm_hi: ; 
GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +510,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -525,7 +525,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX8-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -542,7 +542,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; ; GFX7-LABEL: v_pack_v2f16_inline_f16imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -572,7 +572,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -586,7 +586,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX8-LABEL: 
v_pack_v2f16_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -603,7 +603,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2f16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 529e64715500dd..4b21493bd7ca66 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX803-LABEL: s_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -35,7 +35,7 @@ define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) ; ; GFX7-LABEL: s_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 @@ -62,7 +62,7 @@ define amdgpu_kernel void 
@s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -74,7 +74,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 { define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; GFX9-LABEL: s_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -122,7 +122,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { ; ; GFX803-LABEL: s_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -135,7 +135,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) 
%in0) #0 { ; ; GFX7-LABEL: s_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -158,7 +158,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX803-LABEL: v_pack_v2i16: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -196,7 +196,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; ; GFX7-LABEL: v_pack_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -234,7 +234,7 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_user: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -252,7 +252,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) 
%in0, ptr addrspac ; ; GFX803-LABEL: v_pack_v2i16_user: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -276,7 +276,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; ; GFX7-LABEL: v_pack_v2i16_user: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0x100f000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -316,7 +316,7 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -331,7 +331,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -348,7 +348,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 { ; 
GFX9-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -408,7 +408,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -437,7 +437,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -452,7 +452,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX803-LABEL: v_pack_v2i16_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -469,7 +469,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; ; GFX7-LABEL: v_pack_v2i16_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -498,7 +498,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX803-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX803: ; %bb.0: -; GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -529,7 +529,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; ; GFX7-LABEL: v_pack_v2i16_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s3, 0x100f000 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index c72a7ba3eee834..a3f7906a05f6b1 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -803,5 +803,5 @@ bb: declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index d9ae218551c644..3b00dee1f61f9c 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -108,4 +108,4 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) -attributes #0 = { nounwind "amdgpu-num-vgpr"="5" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" } diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll index 560f0a06798102..8d180e7d33f84f 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -97,7 +97,7 @@ define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) { define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) { ; GCN-LABEL: s_trunc_srl_i64_16_to_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll index 8f450e5bcb83f3..031a46271f2c0e 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane-op-sel.ll @@ -4,10 +4,10 @@ declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) ; 
OBJ-LABEL: : -; OBJ: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] +; OBJ: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; ASM-LABEL: permlane_op_sel: -; ASM: v_permlane16_b32 v0, v0, s5, s6 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0b,0x18,0x00] +; ASM: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; encoding: [0x00,0x08,0x77,0xd7,0x00,0x0f,0x00,0x00] define amdgpu_kernel void @permlane_op_sel(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0) store i32 %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll index 4ae0547d11fff3..caa7fb8df19904 100644 --- a/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll +++ b/llvm/test/CodeGen/AMDGPU/permlane16_opsel.ll @@ -8,7 +8,6 @@ declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workitem.id.y() - define amdgpu_kernel void @v_permlane16_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec @@ -46,9 +45,8 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { -; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec -; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec -; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 
{{%[0-9]+}}, 0, implicit $exec + ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -126,8 +124,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) @@ -170,8 +167,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, 
i32 %src2, i1 false, i1 false) @@ -180,8 +176,7 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 % } define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -191,8 +186,7 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -201,8 +195,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr } define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, 
killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -212,8 +205,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -223,8 +215,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANE16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANE16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 
{{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -234,8 +225,7 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i } define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}(s32), 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -244,8 +234,7 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -255,8 +244,7 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3 } define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - 
; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -265,8 +253,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 0, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -276,8 +263,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 0, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 0, 
{{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 0, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison @@ -287,8 +273,7 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32 } define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { - ; SDAG-GFX10: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec - ; SDAG-GFX11: V_PERMLANEX16_B32_e64 4, killed {{%[0-9]+}}, 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec + ; SDAG: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}(s32), 4, killed {{%[0-9]+}}, 0, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec ; GISEL: V_PERMLANEX16_B32_e64 4, {{%[0-9]+}}, 4, {{%[0-9]+}}, 0, {{%[0-9]+}}, {{%[0-9]+}}, 0, implicit $exec %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index 69ddc9a48dbc43..6cab2b18393070 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x6050400 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -31,17 +31,17 @@ bb: define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsr24_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -58,17 +58,17 @@ bb: define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7060503 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -87,17 +87,17 @@ bb: define amdgpu_kernel void 
@and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020500 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -115,17 +115,17 @@ bb: define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh8_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -142,17 +142,17 @@ bb: define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: lsh16_or_lsr24: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x5040c03 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -169,17 +169,17 @@ bb: define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_xor_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -197,15 +197,15 @@ bb: define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_or_and: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, 
vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_and_b32 s0, s2, 0xff00 +; GCN-NEXT: s_and_b32 s0, s0, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 @@ -227,17 +227,17 @@ bb: define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_and_shl: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x50c0c00 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -255,17 +255,17 @@ bb: define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: or_and_or: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 ; GCN-NEXT: flat_store_dword 
v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -283,20 +283,20 @@ bb: define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff0500: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_bitset1_b32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0xff00 +; GCN-NEXT: s_bitset1_b32 s0, 15 +; GCN-NEXT: s_and_b32 s0, s0, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 @@ -323,21 +323,21 @@ bb: define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_050c0c00: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0x50c0c00 ; GCN-NEXT: v_mov_b32_e32 v6, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s2, s2, 4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, 
s0 +; GCN-NEXT: s_or_b32 s0, s0, 4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v4, v4, s2, v5 +; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm @@ -359,22 +359,22 @@ bb: define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff8004: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_load_dword s2, s[2:3], 0x2c +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 ; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] -; GCN-NEXT: s_or_b32 s2, s2, 4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_or_b32 s0, s0, 4 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_perm_b32 v4, v4, s2, v5 +; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index bf98af33dc7b08..048a7756a7a048 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -609,53 +609,53 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i ; GFX10-LABEL: shuffle8i8: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 ; GFX10-NEXT: s_lshr_b32 s4, s9, 16 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 ; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: v_or_b32_sdwa v0, s3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: shuffle8i8: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s3, 8 +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 -; GFX9-NEXT: v_or_b32_sdwa v4, s3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s9, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s1, s9, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s3 -; GFX9-NEXT: v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1 +; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm bb: %vec0 = load <8 x i8>, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll 
b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll index f53ca53518a172..4794c296215253 100644 --- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 { ; GCN-LABEL: dbg_clause: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index 5a03381447d0eb..a030f86da1b67d 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -8,7 +8,7 @@ ; NON-HSA: s_endpgm ; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0 ; OBJ-COUNT-63: s_nop 0 -define amdgpu_kernel void @preload_kernarg_header(ptr inreg %arg) { +define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { store ptr %arg, ptr %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index a547c258e3921d..e076df97e1ba49 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -1,18 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 
-amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s -define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8: +define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -23,51 +19,27 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; 
GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -78,56 +50,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 { ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -138,51 +86,29 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: 
ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -193,56 +119,34 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -253,51 +157,27 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, 
s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -308,56 +188,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -367,47 +223,25 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg 
preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -417,52 +251,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0 @@ -474,55 +286,29 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-2-LABEL: 
i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4 +; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 @@ -534,60 +320,34 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -600,59 +360,33 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: 
s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 ; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 -; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff -; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -665,56 +399,30 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16 -; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -723,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ret void } -define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -734,47 +442,29 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; 
GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; 
Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -784,52 +474,34 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { -; GFX940-NO-PRELOAD-LABEL: byref_preload_arg: +define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { +; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -843,63 +515,37 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: byref_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: byref_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: byref_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: byref_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: 
byref_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 @@ -913,59 +559,33 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3] -; 
GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0) ; GFX90a-PRELOAD-8-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac } -define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i32_arg: +define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 @@ -995,83 +615,47 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i32_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: 
v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i32_arg: -; GFX940-PRELOAD-2: ; %bb.0: +; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i32_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 -; 
GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i32_arg: -; GFX940-PRELOAD-8: ; %bb.0: +; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i32_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -1090,87 +674,51 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i32_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; 
GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i32_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i32_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i32_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i16_preload_arg(ptr 
addrspace(1) nocapture %out, <3 x i16> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg: +define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1181,51 +729,29 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1236,55 +762,33 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg: +define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1296,55 +800,29 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: -; 
GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1356,59 +834,33 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg: +define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -1420,55 +872,29 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; 
GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 ; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 @@ -1480,59 +906,33 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg: +define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1543,51 +943,43 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-2-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-8: ; 
%bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 @@ -1598,55 +990,47 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; 
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 -; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload 
header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v5f64_arg: +define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -1668,95 +1052,53 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v5f64_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-1-NEXT: 
s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_nop 1 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v5f64_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_nop 1 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v5f64_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_nop 1 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v5f64_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60 +; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 -; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13] ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_nop 1 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v5f64_arg: +; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -1778,99 +1120,57 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v5f64_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v5f64_arg: -; GFX90a-PRELOAD-2: ; %bb.0: +; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload 
header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-2-NEXT: s_nop 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v5f64_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 
v1, s13 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v5f64_arg: -; GFX90a-PRELOAD-8: ; %bb.0: +; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 ; GFX90a-PRELOAD-8-NEXT: s_nop 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; 
GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void } -define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { -; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg: +define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: ; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1879,43 +1179,57 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; -; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-4-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; -; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; -; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg: +; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX90a-NO-PRELOAD: ; %bb.0: ; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 @@ -1924,40 +1238,52 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; -; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; -; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; 
GFX90a-PRELOAD-8-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -1974,44 +1300,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg: @@ -2024,44 +1328,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -2078,44 +1360,22 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; -; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-1: ; %bb.0: -; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-1-NEXT: s_endpgm -; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: ; %bb.0: -; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; -; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-4: ; %bb.0: -; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 -; GFX940-PRELOAD-4-NEXT: s_endpgm -; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: ; %bb.0: -; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-PRELOAD-8-NEXT: s_endpgm ; ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg: @@ -2128,47 +1388,1137 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; -; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-1: ; %bb.0: -; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-1-NEXT: s_endpgm -; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: ; %bb.0: -; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; -; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-4: ; %bb.0: -; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX90a-PRELOAD-4-NEXT: s_endpgm -; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: ; %bb.0: -; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2 -; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3 -; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX90a-PRELOAD-8-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) #0 { +; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) #0 { +; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <3 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <6 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store half %in, ptr addrspace(1) %out + store <7 x bfloat> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; 
GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i1 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) #0 { +; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store fp128 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. 
Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6 +; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) #0 { +; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store <7 x half> %in, ptr addrspace(1) %out + ret void +} + +; Test when previous argument was not dword aligned. 
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i32 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NO-PRELOAD-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <3 x i32> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store i16 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) #0 { +; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-NO-PRELOAD: ; %bb.0: +; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: s_endpgm +; +; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; 
GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-2-NEXT: ; %bb.0: +; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8 +; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1 +; GFX940-PRELOAD-2-NEXT: s_endpgm +; +; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX940-PRELOAD-8-NEXT: ; %bb.0: +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24 +; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1 +; GFX940-PRELOAD-8-NEXT: s_endpgm +; +; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-NO-PRELOAD: ; %bb.0: +; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6 +; GFX90a-NO-PRELOAD-NEXT: global_store_short 
v0, v1, s[0:1] +; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3] +; GFX90a-NO-PRELOAD-NEXT: s_endpgm +; +; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-2-NEXT: ; %bb.0: +; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2 +; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX90a-PRELOAD-2-NEXT: s_endpgm +; +; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg: +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. 
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 +; GFX90a-PRELOAD-8-NEXT: ; %bb.0: +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8 +; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7] +; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11] +; GFX90a-PRELOAD-8-NEXT: s_endpgm + store i16 %in, ptr addrspace(1) %out + store <2 x i8> %in2, ptr addrspace(1) %out2 + ret void +} + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 0d88466fc31b3e..6fdc0d5834ef6e 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -513,8 +513,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr ; ; GCN-LABEL: alloca_promote_atomicrmw_private_lds_promote: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -555,8 +555,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr 
addrspace(1) %out, ; ; GCN-LABEL: alloca_promote_cmpxchg_private: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index cf7efed46cef55..b6afb7cf8c9a11 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -14,13 +14,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -95,13 +95,13 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -165,14 +165,14 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -233,15 +233,15 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read_simplified: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -346,13 +346,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -473,13 +473,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s9 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -589,14 +589,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 
s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -701,13 +701,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s9 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -811,15 +811,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: clmem_read: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 
s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f @@ -1033,13 +1033,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1119,13 +1119,13 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1176,14 +1176,14 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1238,15 +1238,15 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Address32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 
s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1348,13 +1348,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1401,13 +1401,13 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1450,14 +1450,14 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: 
s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1496,15 +1496,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1574,13 +1574,13 @@ define amdgpu_kernel void 
@p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1624,13 +1624,13 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1669,14 +1669,14 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: 
s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -1709,15 +1709,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: p32Offset64: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1781,13 +1781,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s42, -1 ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s9 +; GFX8-NEXT: s_add_u32 s40, s40, s3 ; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 
_Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1844,13 +1844,13 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s9 +; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1903,14 +1903,14 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s42, -1 ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s9 +; GFX10-NEXT: s_add_u32 s40, s40, s3 ; GFX10-NEXT: s_addc_u32 s41, s41, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 
v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] @@ -1958,15 +1958,15 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; ; GFX11-LABEL: DiffBase: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b128 s[36:39], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 @@ -2051,13 +2051,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2132,13 +2132,13 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { 
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2201,14 +2201,14 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2273,15 +2273,15 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; ; GFX11-LABEL: ReverseOrder: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, 
_Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2387,13 +2387,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s9 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2429,13 +2429,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 -; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 
_Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2470,14 +2470,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s9 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX10-NEXT: v_mov_b32_e32 v31, v0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] @@ -2507,15 +2507,15 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; ; GFX11-LABEL: negativeoffset: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: 
s_load_b64 s[34:35], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 9a8d5acfbe3e96..5bb260c09c9ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -18,7 +18,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -50,7 +50,7 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-NEXT: s_mov_b32 s7, 0 ; SDAG-NEXT: s_mov_b32 s6, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -69,7 +69,7 @@ 
define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_mov_b32 s7, 0 ; GISEL-NEXT: s_mov_b32 s6, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -110,7 +110,7 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -132,7 +132,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac ; ; GISEL-LABEL: buffers_might_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -173,7 +173,7 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; SDAG-LABEL: independent_offsets: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -186,7 +186,7 @@ define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) { ; ; GISEL-LABEL: independent_offsets: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 92465420a1ae73..74bad5ea3edce5 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -748,21 +748,21 @@ define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -800,21 +800,21 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
s_rcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -852,21 +852,21 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -904,21 +904,21 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -956,21 +956,21 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { ; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e32 v0, s4 +; SI-NEXT: v_rcp_f32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e32 v2, s4 +; VI-NEXT: v_rcp_f32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1008,21 +1008,21 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr 
addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, |s4| +; SI-NEXT: v_rcp_f32_e64 v0, |s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, |s4| +; VI-NEXT: v_rcp_f32_e64 v2, |s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1061,21 +1061,21 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float % define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_neg_rcp_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -s4 +; SI-NEXT: v_rcp_f32_e64 v0, -s2 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_neg_rcp_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -s4 +; VI-NEXT: v_rcp_f32_e64 v2, -s2 ; VI-NEXT: v_mov_b32_e32 
v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1116,21 +1116,21 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_rcp_f32_e64 v0, -|s4| +; SI-NEXT: v_rcp_f32_e64 v0, -|s2| +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1173,8 +1173,8 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { ; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1 ; ; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_rcp_f32_e64 v2, -|s4| +; VI-NEXT: v_rcp_f32_e64 v2, -|s2| ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4| +; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -1244,7 +1244,7 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1254,10 +1254,10 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5 +; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1309,11 +1309,11 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 ; ; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, 
s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s4, s[0:1], 0x0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -1366,11 +1366,11 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) ; ; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[0:1], 0x0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mul_f32_e32 v2, s4, v0 +; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index b1fa85f7c675b7..24e420b7d657bf 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3212,72 +3212,71 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %i21, ptr addrspace(1) nocapture noundef writeonly align 4 %arg, i32 noundef %arg1) #1 { ; GFX67-LABEL: compute_mad: ; GFX67: ; %bb.0: ; %bb -; GFX67-NEXT: s_load_dword 
s0, s[2:3], 0x6 -; GFX67-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4 -; GFX67-NEXT: s_mov_b32 s7, 0xf000 +; GFX67-NEXT: s_load_dword s3, s[0:1], 0x6 +; GFX67-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX67-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_add_i32 s0, s0, 1 -; GFX67-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX67-NEXT: v_add_i32_e32 v2, vcc, s0, v1 +; GFX67-NEXT: s_load_dword s6, s[6:7], 0x1 +; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX67-NEXT: s_add_i32 s3, s3, 1 +; GFX67-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_and_b32 s6, s6, 0xffff +; GFX67-NEXT: s_mul_i32 s2, s2, s6 +; GFX67-NEXT: v_add_i32_e32 v2, vcc, s3, v1 ; GFX67-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-NEXT: s_load_dword s2, s[10:11], 0x1 -; GFX67-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX67-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_and_b32 s2, s2, 0xffff +; GFX67-NEXT: s_mov_b32 s3, 0xf000 +; GFX67-NEXT: s_mov_b32 s2, 0 ; GFX67-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v3 -; GFX67-NEXT: s_mul_i32 s6, s6, s2 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GFX67-NEXT: s_mov_b32 s6, 0 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX67-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX67-NEXT: v_mov_b32_e32 v2, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s5 ; GFX67-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX67-NEXT: v_add_i32_e32 v3, vcc, v3, v1 ; GFX67-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX67-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX67-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX67-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX67-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, v4, v3 -; GFX67-NEXT: buffer_store_dword v2, v[0:1], 
s[4:7], 0 addr64 +; GFX67-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: compute_mad: ; GFX8: ; %bb.0: ; %bb -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_i32 s0, s0, 1 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_add_i32 s3, s3, 1 +; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s3, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[10:11], 0x4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xffff ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX8-NEXT: s_mul_i32 s2, s2, s1 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: s_mul_i32 s6, s6, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 @@ -3288,104 +3287,102 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX900-LABEL: compute_mad: ; GFX900: ; %bb.0: ; %bb -; GFX900-NEXT: s_load_dword s0, s[2:3], 0x18 +; GFX900-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_add_i32 s0, s0, 1 -; GFX900-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX900-NEXT: v_add_u32_e32 v2, s0, v1 +; GFX900-NEXT: s_add_i32 s3, s3, 1 +; GFX900-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, s9 +; GFX900-NEXT: v_add_u32_e32 v2, s3, v1 ; GFX900-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX900-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX900-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX900-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX900-NEXT: v_mul_lo_u32 v3, v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v5, s1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_and_b32 s1, s4, 0xffff +; GFX900-NEXT: s_and_b32 s3, s3, 0xffff +; GFX900-NEXT: s_mul_i32 s2, s2, s3 ; GFX900-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX900-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX900-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX900-NEXT: s_mul_i32 s6, s6, s1 -; GFX900-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX900-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, s1 ; GFX900-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v4, s3 ; GFX900-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX900-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v3, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX900-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v1, v3, v[1:2] +; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 
; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX900-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, v1, v[2:3] -; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v3 +; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v1, v[2:3] +; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s8, v3 ; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX900-NEXT: global_store_dword v[1:2], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: compute_mad: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x18 -; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX90A-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x10 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_add_i32 s4, s4, 1 -; GFX90A-NEXT: v_mul_lo_u32 v0, s4, v4 -; GFX90A-NEXT: v_add_u32_e32 v1, s4, v0 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX90A-NEXT: v_add_u32_e32 v0, 1, v0 -; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX90A-NEXT: v_add_u32_e32 v0, v2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX90A-NEXT: v_add_u32_e32 v1, 1, v2 -; GFX90A-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX90A-NEXT: s_load_dword s7, s[10:11], 0x4 -; GFX90A-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[2:3], v0, v2, v[0:1] -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 -; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3] +; GFX90A-NEXT: s_add_i32 s3, s3, 1 +; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, s3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX90A-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v2 +; GFX90A-NEXT: 
v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v1 +; GFX90A-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v3, v[2:3] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s4, s7, 0xffff -; GFX90A-NEXT: s_mul_i32 s6, s6, s4 -; GFX90A-NEXT: v_add_u32_e32 v1, s6, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s2, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: s_and_b32 s3, s3, 0xffff +; GFX90A-NEXT: s_mul_i32 s2, s2, s3 +; GFX90A-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX90A-NEXT: global_store_dword v[2:3], v0, off +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s9 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: global_store_dword v[0:1], v2, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: compute_mad: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x18 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s3, s[0:1], 0x18 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s0, s0, 1 -; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, s0, v1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: 
v_add_nc_u32_e32 v2, s3, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: s_load_dword s4, s[10:11], 0x4 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s6, s4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, s2, s3, v[0:1] ; GFX10-NEXT: v_mul_lo_u32 v1, v3, v2 -; GFX10-NEXT: v_add_co_u32 v2, s2, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s2 +; GFX10-NEXT: v_add_co_u32 v2, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, s5, 0, s2 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, v[1:2] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll index f57e86c68ebf98..0c67f00d7bebf7 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-extended-image-insts.ll @@ -35,6 +35,8 @@ define <4 x float> @needs_extimg(float noundef %0, float noundef %1, <8 x i32> n ; IR: define void @caller( define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 x i32> noundef %3) { + ; EXTIMG: call void @needs_extimg( + ; NOEXTIMG: call void null call void @needs_extimg(float %0, float %1, <8 x i32> %2, <4 x i32> %3) ; IR: ret void ret void @@ -43,6 +45,3 @@ define void @caller(float noundef %0, float noundef %1, <8 x i32> noundef %2, <4 declare <4 x float> 
@llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) attributes #0 = { "target-features"="+extended-image-insts" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; EXTIMG: {{.*}} -; NOEXTIMG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll index e0b694ee58f0ef..a0380c82d9aaf0 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-functions.ll @@ -115,6 +115,11 @@ @ConstantExpr = internal global i64 ptrtoint (ptr @needs_dpp to i64) define void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #0 { +; GFX7-NOT: define void @needs_dpp( +; GFX8: define void @needs_dpp( +; GFX9: define void @needs_dpp( +; GFX10: define void @needs_dpp( +; GFX11: define void @needs_dpp( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -134,6 +139,11 @@ endif: } define void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #1 { +; GFX7-NOT: define void @needs_16bit_insts( +; GFX8: define void @needs_16bit_insts( +; GFX9: define void @needs_16bit_insts( +; GFX10: define void @needs_16bit_insts( +; GFX11: define void @needs_16bit_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -153,6 +163,11 @@ endif: } define void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #2 { +; GFX7-NOT: define void @needs_gfx8_insts( +; GFX8: define void @needs_gfx8_insts( +; GFX9: define void @needs_gfx8_insts( +; GFX10: define void @needs_gfx8_insts( +; GFX11: define void @needs_gfx8_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -172,6 +187,11 @@ endif: } define void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #3 { +; GFX7-NOT: define void @needs_gfx9_insts( +; GFX8-NOT: define void @needs_gfx9_insts( +; 
GFX9: define void @needs_gfx9_insts( +; GFX10: define void @needs_gfx9_insts( +; GFX11: define void @needs_gfx9_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -191,6 +211,11 @@ endif: } define void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #4 { +; GFX7-NOT: define void @needs_gfx10_insts( +; GFX8-NOT: define void @needs_gfx10_insts( +; GFX9-NOT: define void @needs_gfx10_insts( +; GFX10: define void @needs_gfx10_insts( +; GFX11: define void @needs_gfx10_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -210,6 +235,11 @@ endif: } define void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #5 { +; GFX7-NOT: define void @needs_gfx11_insts( +; GFX8-NOT: define void @needs_gfx11_insts( +; GFX9-NOT: define void @needs_gfx11_insts( +; GFX10-NOT: define void @needs_gfx11_insts( +; GFX11: define void @needs_gfx11_insts( entry: %cmp = icmp eq i64 %a, 0 br i1 %cmp, label %if, label %else @@ -229,18 +259,34 @@ endif: } define void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #6 { +; GFX7-NOT: define void @needs_dot1_insts( +; GFX8-NOT: define void @needs_dot1_insts( +; GFX9: define void @needs_dot1_insts( +; GFX10: define void @needs_dot1_insts( +; GFX11-NOT: define void @needs_dot1_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #7 { +; GFX7-NOT: define void @needs_dot2_insts( +; GFX8-NOT: define void @needs_dot2_insts( +; GFX9: define void @needs_dot2_insts( +; GFX10: define void @needs_dot2_insts( +; GFX11-NOT: define void @needs_dot2_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { +; GFX7-NOT: define void @needs_dot3_insts( +; GFX8-NOT: define void @needs_dot3_insts( +; GFX906-NOT: define void @needs_dot3_insts( +; GFX90A: define void @needs_dot3_insts( +; GFX10-NOT: define void 
@needs_dot3_insts( +; GFX11-NOT: define void @needs_dot3_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -248,30 +294,58 @@ define void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #8 { define void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #9 { +; GFX7-NOT: define void @needs_dot4_insts( +; GFX8-NOT: define void @needs_dot4_insts( +; GFX906-NOT: define void @needs_dot4_insts( +; GFX90A: define void @needs_dot4_insts( +; GFX10-NOT: define void @needs_dot4_insts( +; GFX11-NOT: define void @needs_dot4_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #10 { +; GFX7-NOT: define void @needs_dot5_insts( +; GFX8-NOT: define void @needs_dot5_insts( +; GFX906-NOT: define void @needs_dot5_insts( +; GFX90A: define void @needs_dot5_insts( +; GFX10: define void @needs_dot5_insts( +; GFX11: define void @needs_dot5_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #11 { +; GFX7-NOT: define void @needs_dot6_insts( +; GFX8-NOT: define void @needs_dot6_insts( +; GFX906-NOT: define void @needs_dot6_insts( +; GFX90A: define void @needs_dot6_insts( +; GFX10: define void @needs_dot6_insts( +; GFX11-NOT: define void @needs_dot6_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #12 { +; GFX7-NOT: define void @needs_dot7_insts( +; GFX8-NOT: define void @needs_dot7_insts( +; GFX9: define void @needs_dot7_insts( +; GFX10: define void @needs_dot7_insts( +; GFX11: define void @needs_dot7_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void } define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { +; GFX7-NOT: define void @needs_dot8_insts( +; GFX8-NOT: define void @needs_dot8_insts( +; GFX9-NOT: define void @needs_dot8_insts( +; GFX10-NOT: define void 
@needs_dot8_insts( +; GFX11: define void @needs_dot8_insts( %add = add i64 %a, %b store i64 %add, ptr %out ret void @@ -279,22 +353,95 @@ define void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) #13 { ; IR: define void @caller( define void @caller(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) { + ; GFX7: call void null( + ; GFX8: call void @needs_dpp( + ; GFX9: call void @needs_dpp( + ; GFX10: call void @needs_dpp( + ; GFX11: call void @needs_dpp( call void @needs_dpp(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void @needs_16bit_insts( + ; GFX9: call void @needs_16bit_insts( + ; GFX10: call void @needs_16bit_insts( + ; GFX11: call void @needs_16bit_insts( call void @needs_16bit_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void @needs_gfx8_insts( + ; GFX9: call void @needs_gfx8_insts( + ; GFX10: call void @needs_gfx8_insts( + ; GFX11: call void @needs_gfx8_insts( call void @needs_gfx8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_gfx9_insts( + ; GFX10: call void @needs_gfx9_insts( ; GFX111: call void @needs_gfx9_insts(c call void @needs_gfx9_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void @needs_gfx10_insts( ; GFX111: call void @needs_gfx10_insts( call void @needs_gfx10_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void null( + ; GFX11: call void @needs_gfx11_insts( call void @needs_gfx11_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_dot1_insts( + ; GFX10: call void @needs_dot1_insts( + ; GFX11: call void null( call void @needs_dot1_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: 
call void null( + ; GFX9: call void @needs_dot2_insts( + ; GFX10: call void @needs_dot2_insts( + ; GFX11: call void null( call void @needs_dot2_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot3_insts( + ; GFX10: call void null( + ; GFX11: call void null( call void @needs_dot3_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot4_insts( + ; GFX10: call void null( + ; GFX11: call void null( call void @needs_dot4_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot5_insts( + ; GFX10: call void @needs_dot5_insts( + ; GFX11: call void @needs_dot5_insts( call void @needs_dot5_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX906: call void null( + ; GFX90A: call void @needs_dot6_insts( + ; GFX10: call void @needs_dot6_insts( + ; GFX11: call void null( call void @needs_dot6_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void @needs_dot7_insts( + ; GFX10: call void @needs_dot7_insts( + ; GFX11: call void @needs_dot7_insts( call void @needs_dot7_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; GFX7: call void null( + ; GFX8: call void null( + ; GFX9: call void null( + ; GFX10: call void null( + ; GFX11: call void @needs_dot8_insts( call void @needs_dot8_insts(ptr %out, ptr %in, i64 %a, i64 %b, i64 %c) + ; IR: ret void ret void } @@ -312,12 +459,3 @@ attributes #10 = { "target-features"="+dot5-insts" } attributes #11 = { "target-features"="+dot6-insts" } attributes #12 = { "target-features"="+dot7-insts" } attributes #13 = { "target-features"="+dot8-insts" } -;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX7: {{.*}} -; GFX8: {{.*}} -; GFX9: {{.*}} -; GFX906: {{.*}} -; GFX90A: {{.*}} -; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll index 2b1e3999a8aa8a..594fad389b6b97 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-gws.ll @@ -38,7 +38,10 @@ define void @needs_gws(i32 %val0, i32 %val1) #0 { ; IR: define void @gws_caller( define void @gws_caller(i32 %val0, i32 %val1) { + ; COMPATIBLE: call void @needs_gws( + ; INCOMPATIBLE: call void null call void @needs_gws(i32 %val0, i32 %val1) + ; IR: ret void ret void } @@ -49,7 +52,3 @@ declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 attributes #0 = { "target-features"="+gws"} attributes #1 = { convergent inaccessiblememonly nounwind } attributes #2 = { convergent inaccessiblememonly nounwind writeonly } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; COMPATIBLE: {{.*}} -; INCOMPATIBLE: {{.*}} -; IR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll index 32fed3ba22c590..2c2401f120cf5e 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll @@ -41,6 +41,8 @@ define i64 @needs_s_memrealtime() #0 { ; IR: define void @s_memrealtime_caller( define i64 @s_memrealtime_caller() { + ; REALTIME: call i64 @needs_s_memrealtime( + ; NOREALTIME: call i64 null %t = call i64 @needs_s_memrealtime() ; IR: ret i64 %t ret i64 %t @@ -55,6 +57,8 @@ define i64 @needs_s_memtime() #1 { ; IR: define void @s_memtime_caller( define i64 @s_memtime_caller() { + ; MEMTIME: call i64 @needs_s_memtime( + ; NOMEMTIME: call i64 null %t = call i64 @needs_s_memtime() ; IR: ret i64 %t ret i64 %t @@ -66,10 +70,3 @@ declare i64 @llvm.amdgcn.s.memtime() attributes #0 = { "target-features"="+s-memrealtime"} attributes #1 = { "target-features"="+s-memtime-inst"} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; COMPATIBLE: {{.*}} -; INCOMPATIBLE: {{.*}} -; MEMTIME: {{.*}} -; NOMEMTIME: {{.*}} -; NOREALTIME: {{.*}} -; REALTIME: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index fdce4431fbbf25..a87973d93ac778 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -21,7 +21,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s3, 32, s3 @@ -35,7 +35,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -47,7 +47,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s3, 32, s3 @@ -57,7 +57,7 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s3, 32, s3 @@ -95,8 +95,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, 32, s6 ; GFX8-NEXT: s_sub_i32 s3, 32, s7 @@ -128,22 +128,22 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s7 -; GFX10-NEXT: s_sub_i32 s3, 32, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: s_sub_i32 s1, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s0 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s7 @@ -188,8 +188,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr 
addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotl_v4i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -210,8 +210,8 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotl_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s3, 32, s9 ; GFX8-NEXT: s_sub_i32 s9, 32, s11 @@ -233,26 +233,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotl_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s8 -; GFX10-NEXT: s_sub_i32 s3, 32, s9 +; GFX10-NEXT: s_sub_i32 s0, 32, s8 +; GFX10-NEXT: s_sub_i32 s1, 32, s9 ; GFX10-NEXT: s_sub_i32 s8, 32, s11 ; GFX10-NEXT: s_sub_i32 s9, 32, s10 ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s8 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s9 -; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s2 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s0 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, 32, s8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 0e1dd69d930ae5..058ee589bc4b09 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; SI-LABEL: rotr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 @@ -43,7 +43,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -52,7 +52,7 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 @@ -84,8 +84,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i 
; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -98,8 +98,8 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -113,20 +113,20 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s7 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s6 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7 @@ -161,8 +161,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; SI-LABEL: rotr_v4i32: ; SI: ; %bb.0: ; %entry 
-; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -179,8 +179,8 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; ; GFX8-LABEL: rotr_v4i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 @@ -198,22 +198,22 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: rotr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v3, s7, s7, s11 ; GFX10-NEXT: v_alignbit_b32 v2, s6, s6, s10 ; GFX10-NEXT: v_alignbit_b32 v1, s5, s5, s9 ; GFX10-NEXT: v_alignbit_b32 v0, s4, s4, s8 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v4i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 
40a8592dba6df0..846fbdb33d668d 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -20,7 +20,7 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -38,7 +38,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -56,7 +56,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; GCN-DAZ-SAFE-LABEL: rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -134,7 +134,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 
s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -198,39 +198,39 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) { ; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; ; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s4 +; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v0, s2 +; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-IEEE-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IEEE-UNSAFE-NEXT: s_endpgm ; ; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb +; GCN-DAZ-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 +; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1 ; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 @@ -245,21 +245,20 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) ; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-DAZ-SAFE-NEXT: s_endpgm ; ; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -289,15 +288,15 
@@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; ; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[2:3], 0xb -; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dword s2, s[0:1], 0xb +; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1 -; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 +; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s2, v1 +; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s2 +; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s2, v0 ; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] ; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 @@ -367,7 +366,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-UNSAFE-NEXT: s_endpgm ; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-DAZ-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -391,7 +390,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, 0 ; GCN-IEEE-UNSAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -415,7 +414,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-DAZ-SAFE-LABEL: rsqrt_fmul: ; GCN-DAZ-SAFE: ; 
%bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -466,7 +465,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-IEEE-SAFE-LABEL: rsqrt_fmul: ; GCN-IEEE-SAFE: ; %bb.0: -; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0 ; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -533,7 +532,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -552,7 +551,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -571,7 +570,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -606,7 +605,7 
@@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -649,7 +648,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -714,7 +713,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-UNSAFE: ; %bb.0: -; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -733,7 +732,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-IEEE-UNSAFE: ; %bb.0: -; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IEEE-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s6, -1 ; GCN-IEEE-UNSAFE-NEXT: s_mov_b32 s10, s6 @@ -752,7 +751,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32: ; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 ; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 @@ -787,7 +786,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 @@ -830,7 +829,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index 78ea3b3699f2a5..0b58b950505244 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -10,7 +10,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 65 store i32 %add, ptr addrspace(1) %out ret void @@ -20,7 +20,7 @@ define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, ptr addrspace(1) %out0 @@ -31,7 
+31,7 @@ define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspac ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, ptr addrspace(1) %out ret void @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) #0 { ; SI-LABEL: {{^}}s_addk_i32_k2: ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, -17 store i32 %add, ptr addrspace(1) %out ret void @@ -49,7 +49,7 @@ define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) #0 { ; SI-LABEL: {{^}}s_addk_i32_k3: ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} ; SI: s_endpgm -define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, -65 store i32 %add, ptr addrspace(1) %out ret void @@ -60,7 +60,7 @@ define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) #0 { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b -define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) { %add = add <2 x i32> %b, store <2 x i32> %add, ptr addrspace(1) %out ret void @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) 
%out, <4 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) { %add = add <4 x i32> %b, store <4 x i32> %add, ptr addrspace(1) %out ret void @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) #0 { +define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) { %add = add <8 x i32> %b, store <8 x i32> %add, ptr addrspace(1) %out ret void @@ -97,7 +97,7 @@ define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) #0 { +define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, ptr addrspace(1) %out ret void @@ -116,5 +116,5 @@ define amdgpu_kernel void @commute_s_addk_i32(ptr addrspace(1) %out, i32 %b) #0 declare i32 @llvm.amdgcn.groupstaticsize() #1 -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 35a5210d1c790b..0492c5663e6660 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; 
GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -30,8 +30,8 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) { ; GCN-LABEL: v_sad_u32_constant_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 @@ -55,8 +55,8 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -79,12 +79,12 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 
s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: s_min_u32 s0, s0, s1 @@ -93,7 +93,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -115,19 +115,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_add_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -147,19 +147,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_max_pat1: ; GCN: ; %bb.0: -; 
GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -182,19 +182,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_min_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt 
vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -218,19 +218,19 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -251,12 +251,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_select_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s13 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: s_mov_b64 s[8:9], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s8, s8, s7 +; GCN-NEXT: 
s_addc_u32 s9, s9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: s_sub_i32 s6, s1, s0 @@ -266,7 +266,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -286,9 +286,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -322,9 +322,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0xc -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -356,11 +356,11 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void 
@v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) { ; GCN-LABEL: v_sad_u32_i16_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NEXT: s_and_b32 s4, s6, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: flat_load_ushort v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ushort v2, v[0:1] glc @@ -415,8 +415,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: v_sad_u32_i8_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -446,7 +446,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc @@ -474,8 +474,8 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { 
define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { ; GCN-LABEL: s_sad_u32_i8_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s4, s2, 8 ; GCN-NEXT: s_and_b32 s3, s2, 0xff @@ -505,8 +505,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -534,8 +534,8 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { ; GCN-LABEL: v_sad_u32_mismatched_operands_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x2 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 684279a3776fc5..bd3c422b52efcd 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -15,8 +15,8 @@ declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ; 
SI-LABEL: saddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -38,8 +38,8 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: saddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s2, s6, s0 @@ -59,20 +59,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; GFX9-LABEL: saddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -80,26 +80,26 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-LABEL: saddo_i64_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s6, s0 -; GFX10-NEXT: s_addc_u32 s3, s7, s1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] -; GFX10-NEXT: s_xor_b32 s0, s0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_add_u32 s0, s6, s2 +; GFX10-NEXT: s_addc_u32 s1, s7, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: saddo_i64_zext: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 @@ -128,34 +128,34 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 
s[12:13], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_add_i32 s14, s12, s13 -; SI-NEXT: s_cmp_lt_i32 s13, 0 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s14, s12 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_add_i32 s12, s8, s9 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_add_i32 s4, s0, s1 @@ -175,15 +175,15 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_add_i32 s0, s2, s3 +; GFX9-NEXT: v_add_i32 v1, s2, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] @@ -192,12 +192,12 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: s_saddo_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_add_nc_i32 v0, s2, s3 clamp +; GFX10-NEXT: s_add_i32 s0, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo @@ -208,8 +208,8 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-LABEL: s_saddo_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp ; GFX11-NEXT: s_add_i32 s4, s4, s5 @@ -234,7 +234,7 @@ define 
amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -288,7 +288,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -304,7 +304,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -321,7 +321,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -352,7 +352,7 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %carryout, i64 %a, i64 %b) nounwind { ; SI-LABEL: s_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s8, s4, s6 @@ -420,7 +420,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s8, s4, s6 @@ -437,7 +437,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: s_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 @@ -465,7 +465,7 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -496,7 +496,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_saddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -521,7 +521,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] @@ -539,7 +539,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_saddo_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -592,7 +592,7 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; 
SI-NEXT: s_mov_b32 s14, s10 @@ -627,7 +627,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_saddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -656,7 +656,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5] @@ -676,7 +676,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: v_saddo_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 @@ -697,7 +697,7 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: v_saddo_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 1700ce302cc9db..5260a4847f70d4 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 
-1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -53,7 +53,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: scalar_to_vector_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -73,7 +73,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: scalar_to_vector_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -219,8 +219,8 @@ bb: define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind { ; SI-LABEL: scalar_to_vector_test6: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -230,8 +230,8 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero ; ; VI-LABEL: scalar_to_vector_test6: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 89a09dc4fcc171..baee88b69d0602 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -37,7 +37,7 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -52,9 +52,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -86,7 +85,7 @@ entry: 
define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 @@ -101,7 +100,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 @@ -116,7 +115,7 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 @@ -131,9 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 7f8240eeb98ebf..0fe7666322949f 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -20,7 +20,7 @@ 
define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x8 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index b57a51f1382aec..6372d74161fad7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -60,7 +60,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -104,7 +104,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -199,7 +199,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: 
s_mov_b32 s10, s6 @@ -220,7 +220,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -241,7 +241,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -293,7 +293,7 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: slow_sdiv_i32_3435: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -316,7 +316,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; TONGA-LABEL: slow_sdiv_i32_3435: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa ; ; GFX9-LABEL: slow_sdiv_i32_3435: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -391,7 +391,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) 
%out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -462,7 +462,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -682,7 +682,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -707,7 +707,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -732,7 +732,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v2i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -791,7 +791,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s6, s10 @@ -918,7 +918,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 ; TONGA-NEXT: s_mov_b32 s6, s10 @@ -1045,7 +1045,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1371,7 +1371,7 @@ define amdgpu_kernel void 
@sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: sdiv_v4i32_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1449,7 +1449,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1482,7 +1482,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; TONGA-LABEL: v_sdiv_i8: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_mov_b32 s10, s6 @@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: v_sdiv_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s10, s6 @@ -1594,7 +1594,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1637,7 +1637,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1680,7 +1680,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -1824,7 +1824,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -1865,7 +1865,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -1962,7 +1962,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: 
s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s10, s2 @@ -2009,7 +2009,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 ; TONGA-NEXT: s_mov_b32 s10, s2 @@ -2056,7 +2056,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s10, s2 @@ -2189,7 +2189,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; GCN-LABEL: scalarize_mulhs_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,7 +2221,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; TONGA-LABEL: scalarize_mulhs_4xi32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s7, 0xf000 ; TONGA-NEXT: s_mov_b32 s6, -1 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read ; ; GFX9-LABEL: scalarize_mulhs_4xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll 
b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index f4776747f16ac1..c310e257adadc6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,20 +5,20 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s1, 31 -; GCN-NEXT: s_add_u32 s0, s0, s8 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s8 ; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_addc_u32 s1, s1, s8 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] +; GCN-NEXT: s_addc_u32 s3, s3, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: s_sub_u32 s4, 0, s10 ; GCN-NEXT: s_subb_u32 s5, 0, s11 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -140,8 +140,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_sdiv: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 @@ -460,8 +460,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; 
GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -490,8 +490,8 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -587,8 +587,8 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -628,8 +628,8 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,14 +676,14 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i64 
s[8:9], s[2:3], 33 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -720,14 +720,14 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 33 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -771,8 +771,8 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -801,8 +801,8 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -838,14 +838,14 @@ 
define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 ; GCN-NEXT: s_abs_i32 s9, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -882,14 +882,14 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 39 ; GCN-IR-NEXT: s_abs_i32 s9, s8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -933,94 +933,94 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_sdiv24_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: 
s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_xor_b32 s1, s8, s2 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-NEXT: s_or_b32 s7, s4, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s10 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-NEXT: s_xor_b32 s4, s6, s10 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s2, 0 -; 
GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[12:13], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[10:11], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_xor_b32 s1, s8, s2 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[14:15], 40 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| -; GCN-IR-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-IR-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s1, v2 +; GCN-IR-NEXT: s_or_b32 s7, s4, 1 +; GCN-IR-NEXT: 
v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GCN-IR-NEXT: s_xor_b32 s0, s0, s10 -; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GCN-IR-NEXT: s_xor_b32 s4, s6, s10 +; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr <2 x i64> %x, %2 = ashr <2 x i64> %y, @@ -1032,18 +1032,20 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_sdiv24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 
-1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s2, s7 -; GCN-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_sext_i32_i16 s5, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_sext_i32_i16 s4, s7 ; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_alignbit_b32 v2, s2, v2, 24 +; GCN-NEXT: v_alignbit_b32 v2, s4, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 @@ -1055,35 +1057,33 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GCN-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 +; 
GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 ; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16 -; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 @@ -1146,8 +1146,8 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: 
s_ashr_i32 s4, s3, 31 @@ -1853,7 +1853,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_sdiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 669ed915a002ae..911bb44078d510 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: add_shr_i32: ; NOSDWA: ; %bb.0: -; 
NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -22,7 +22,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: add_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -36,7 +36,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: add_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -47,7 +47,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: add_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -65,7 +65,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: sub_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -80,7 +80,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: sub_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -94,7 +94,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: sub_shr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -105,7 +105,7 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: sub_shr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -123,8 +123,8 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { ; NOSDWA-LABEL: mul_shr_i32: ; NOSDWA: ; %bb.0: -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -147,8 +147,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX89-LABEL: mul_shr_i32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -168,12 +168,12 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: mul_shr_i32: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -183,13 +183,13 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-LABEL: mul_shr_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -210,8 +210,8 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 
@@ -231,8 +231,8 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -252,12 +252,12 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -267,13 +267,13 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -293,8 +293,8 @@ entry: define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, 
ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -319,8 +319,8 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -342,12 +342,12 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -357,13 +357,13 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -383,8 +383,8 @@ entry: define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -415,8 +415,8 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v4i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -441,12 +441,12 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v4i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -457,13 +457,13 
@@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v4i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3 @@ -484,8 +484,8 @@ entry: define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -528,8 +528,8 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX89-LABEL: mul_v8i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -560,12 +560,12 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX9-LABEL: mul_v8i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -578,13 +578,13 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX10-LABEL: mul_v8i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7 @@ -607,8 +607,8 @@ entry: define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -625,8 +625,8 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
+; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -643,12 +643,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] @@ -657,13 +657,13 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] @@ -679,8 +679,8 @@ entry: define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; 
NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -703,8 +703,8 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -723,12 +723,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -737,13 +737,13 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 
@@ -759,8 +759,8 @@ entry: define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -789,8 +789,8 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v4half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -812,12 +812,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v4half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -827,13 +827,13 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v4half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 @@ -850,8 +850,8 @@ entry: define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 @@ -892,8 +892,8 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mul_v8half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -921,12 +921,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mul_v8half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -938,13 +938,13 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mul_v8half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 @@ -963,8 +963,8 @@ entry: define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -983,8 +983,8 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX89-LABEL: mul_i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v2, s7 ; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0 @@ -1003,11 +1003,11 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; ; GFX9-LABEL: mul_i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 @@ -1017,12 +1017,12 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ; GFX10-LABEL: mul_i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] -; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 @@ -1042,8 +1042,8 @@ entry: define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v2i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1070,8 +1070,8 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v2i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1093,12 +1093,12 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v2i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v2i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1142,8 +1142,8 @@ entry: define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v4i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1182,8 +1182,8 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v4i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1209,12 +1209,12 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v4i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 @@ -1230,13 +1230,13 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v4i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1 
@@ -1271,8 +1271,8 @@ entry: define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mul_v8i8: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1330,8 +1330,8 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX89-LABEL: mul_v8i8: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1364,12 +1364,12 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; ; GFX9-LABEL: mul_v8i8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3 @@ -1392,13 +1392,13 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX10-LABEL: mul_v8i8: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0 @@ -1449,7 +1449,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; NOSDWA-LABEL: sitofp_v2i16_to_v2f16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX89-LABEL: sitofp_v2i16_to_v2f16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s2 ; GFX89-NEXT: v_mov_b32_e32 v1, s3 @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX9-LABEL: sitofp_v2i16_to_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1496,7 +1496,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX10-LABEL: sitofp_v2i16_to_v2f16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1519,8 +1519,8 @@ entry: define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr 
addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -1543,8 +1543,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX89-LABEL: mac_v2half: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 @@ -1566,12 +1566,12 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX9-LABEL: mac_v2half: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1581,13 +1581,13 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-LABEL: mac_v2half: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 @@ -1605,7 +1605,7 @@ entry: define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; NOSDWA-LABEL: immediate_mul_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX89-LABEL: immediate_mul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX89-NEXT: v_mov_b32_e32 v3, 0x141 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1644,7 +1644,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: immediate_mul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: immediate_mul_v2i16: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1679,8 +1679,8 @@ entry: define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mulmul_v2i16: 
; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: mulmul_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1733,12 +1733,12 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: mulmul_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -1749,13 +1749,13 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: mulmul_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 
0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1777,8 +1777,8 @@ entry: define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: add_bb_v2i16: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 @@ -1802,8 +1802,8 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX89-LABEL: add_bb_v2i16: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -1822,12 +1822,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: add_bb_v2i16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -1836,13 +1836,13 @@ define amdgpu_kernel void @add_bb_v2i16(ptr 
addrspace(1) %out, ptr addrspace(1) ; GFX10-LABEL: add_bb_v2i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] @@ -1863,7 +1863,7 @@ store_label: define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { ; NOSDWA-LABEL: pulled_out_test: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: v_mov_b32_e32 v4, 8 ; GFX89-NEXT: v_mov_b32_e32 v5, 0xff ; GFX89-NEXT: s_waitcnt lgkmcnt(0) @@ -1929,7 +1929,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1955,7 +1955,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry -; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 @@ -2198,8 +2198,8 @@ bb2: define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { ; NOSDWA-LABEL: mac_v2half_same_srcop: ; NOSDWA: ; %bb.0: ; %entry -; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 @@ -2222,8 +2222,8 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX89-LABEL: mac_v2half_same_srcop: ; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX89-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 @@ -2245,11 +2245,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; ; GFX9-LABEL: mac_v2half_same_srcop: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 @@ -2261,12 +2261,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr ; GFX10-LABEL: 
mac_v2half_same_srcop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index f11e86aef683d1..d807c3909e656e 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GCN-LABEL: select_constant_cttz: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 01c4031f3cf613..cf9ccc575385c8 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,35 +6,35 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 
s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -42,50 +42,50 @@ define amdgpu_kernel void @select_f16( ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[12:13], 
s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 @@ -139,7 +139,7 @@ entry: define amdgpu_kernel void @select_f16_imm_a( ; SI-LABEL: select_f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -174,7 +174,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; VI-LABEL: select_f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -205,7 +205,7 @@ define amdgpu_kernel void @select_f16_imm_a( ; ; GFX11-LABEL: select_f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -252,7 +252,7 @@ entry: define amdgpu_kernel void @select_f16_imm_b( ; SI-LABEL: select_f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -287,7 +287,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; VI-LABEL: select_f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -318,7 +318,7 @@ define amdgpu_kernel void @select_f16_imm_b( ; ; GFX11-LABEL: select_f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -365,7 +365,7 @@ entry: define amdgpu_kernel void @select_f16_imm_c( ; SI-LABEL: select_f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -400,7 
+400,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; VI-LABEL: select_f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -432,7 +432,7 @@ define amdgpu_kernel void @select_f16_imm_c( ; ; GFX11-LABEL: select_f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -479,7 +479,7 @@ entry: define amdgpu_kernel void @select_f16_imm_d( ; SI-LABEL: select_f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -514,7 +514,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; VI-LABEL: select_f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -546,7 +546,7 @@ define amdgpu_kernel void @select_f16_imm_d( ; ; GFX11-LABEL: select_f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -593,31 +593,31 @@ entry: define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: 
s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s14 -; SI-NEXT: s_mov_b32 s23, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -642,36 +642,36 @@ define amdgpu_kernel void @select_v2f16( ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 -; VI-NEXT: s_mov_b32 s18, s14 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s22, s14 -; VI-NEXT: s_mov_b32 s23, s15 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s11, s15 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -686,14 +686,14 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: select_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[12:13], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s2 @@ -754,7 +754,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_a( ; SI-LABEL: select_v2f16_imm_a: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -801,7 +801,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; VI-LABEL: select_v2f16_imm_a: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -839,7 +839,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; ; GFX11-LABEL: select_v2f16_imm_a: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -895,7 +895,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_b( ; SI-LABEL: select_v2f16_imm_b: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -942,7 +942,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; VI-LABEL: select_v2f16_imm_b: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 @@ -980,7 +980,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; ; GFX11-LABEL: select_v2f16_imm_b: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -1036,7 +1036,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_c( ; SI-LABEL: select_v2f16_imm_c: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: 
s_mov_b32 s14, s10 @@ -1083,7 +1083,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; GFX11-LABEL: select_v2f16_imm_c: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 @@ -1179,7 +1179,7 @@ entry: define amdgpu_kernel void @select_v2f16_imm_d( ; SI-LABEL: select_v2f16_imm_d: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -1226,7 +1226,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; GFX11-LABEL: select_v2f16_imm_d: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s10 diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index cc82f532fc4779..c00cd763992d97 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -463,4 +463,4 @@ entry: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll index 31a802b7428b95..2169ee117cbaaf 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -68,8 +68,8 @@ define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: sext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -89,9 +89,9 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -111,8 +111,8 @@ define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a define amdgpu_kernel void @sext_i32_to_i64_divergent(ptr addrspace(1) %out, i32 %a, i64 %b) { ; GCN-LABEL: sext_i32_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 0630cca7c099b8..b67ecc2f9d13c8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,18 +11,18 @@ define amdgpu_kernel void 
@sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dword s0, s[2:3], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s7, s7, s0 +; SI-NEXT: s_add_i32 s7, s7, s2 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if ; SI-NEXT: s_sub_i32 s7, s5, s6 ; SI-NEXT: .LBB0_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s7, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -56,23 +56,23 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s0, s[2:3], 0x2e -; SI-NEXT: s_load_dword s1, s[2:3], 0x37 +; SI-NEXT: s_load_dword s2, s[0:1], 0x2e +; SI-NEXT: s_load_dword s3, s[0:1], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %if -; SI-NEXT: s_load_dword s0, s[2:3], 0x1c -; SI-NEXT: s_load_dword s1, s[2:3], 0x25 +; SI-NEXT: s_load_dword s2, s[0:1], 0x1c +; SI-NEXT: s_load_dword s3, s[0:1], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s5, s0, s1 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: .LBB1_3: ; %endif -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; SI-NEXT: s_add_i32 s4, s5, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -108,28 +108,28 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xc +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s8, s6, s7 ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_xor_b64 exec, exec, s[0:1] +; SI-NEXT: s_xor_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB2_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -158,8 +158,8 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll index 3d8807a88a46c1..8abd4b4302f547 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3 target triple = "amdgcn-amd-amdhsa" ; CHECK-LABEL: {{^}}t0: -; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[6:7], 0x0 +; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] ; There should be no redundant copies from PTR_HI. ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 37cf76103aa945..21fcd3cd0dcd61 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -164,4 +164,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 6de015c6de79b2..bdc607552a0dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -391,7 +391,8 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr add } declare i32 @llvm.amdgcn.workitem.id.x() #0 + declare i32 @llvm.amdgcn.workgroup.id.x() #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index ebc916b5c889b5..4b02d00ddce1ef 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -181,7 +181,7 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -214,7 +214,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -247,7 +247,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -430,7 +430,7 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ 
-502,7 +502,7 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 @@ -574,7 +574,7 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 47ab5ba666877a..b3f4790df4d485 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -29,7 +29,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -70,7 +70,7 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -92,7 +92,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -140,7 +140,7 @@ define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -159,7 +159,7 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -214,40 +214,40 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort 
v0, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_s: @@ -287,42 +287,42 @@ define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { ; SI-LABEL: shl_i16_v_compute_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s12, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_i16_v_compute_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s12, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_v_compute_s: @@ -370,7 +370,7 @@ define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrsp define 
amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i16_computed_amount: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -396,7 +396,7 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: shl_i16_computed_amount: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 @@ -472,8 +472,8 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; SI-LABEL: shl_i16_i_s: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -484,8 +484,8 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { ; ; VI-LABEL: shl_i16_i_s: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -530,7 +530,7 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -561,7 
+561,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -630,7 +630,7 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -659,7 +659,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -752,7 +752,7 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -770,7 +770,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: shl_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -819,7 +819,7 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr 
addrspace(1) %in) { ; SI-LABEL: shl_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -839,7 +839,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -903,7 +903,7 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: shl_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -929,7 +929,7 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -1029,8 +1029,8 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -1041,8 +1041,8 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a ; ; VI-LABEL: s_shl_32_i64: ; VI: ; %bb.0: 
-; VI-NEXT: s_load_dword s4, s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -1070,34 +1070,34 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_shl_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; SI-NEXT: s_ashr_i32 s7, s6, 31 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_ashr_i32 s3, s2, 31 +; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_shl_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; VI-NEXT: s_ashr_i32 s7, s6, 31 -; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_ashr_i32 s3, s2, 31 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, s4 -; VI-NEXT: s_addc_u32 s3, s3, s5 +; VI-NEXT: s_add_u32 s2, s6, s0 +; VI-NEXT: s_addc_u32 s3, s7, s1 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_add_u32 s0, s0, s4 -; 
VI-NEXT: s_addc_u32 s1, s1, s5 +; VI-NEXT: s_add_u32 s0, s4, s0 +; VI-NEXT: s_addc_u32 s1, s5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1137,7 +1137,7 @@ define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: s_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s9, 0xffff ; SI-NEXT: s_mov_b32 s8, s6 @@ -1153,7 +1153,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: s_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s9, 0xffff ; VI-NEXT: s_mov_b32 s8, s6 @@ -1195,7 +1195,7 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_constant_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1215,7 +1215,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: v_shl_constant_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1266,7 +1266,7 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_i64_32_bit_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1285,7 +1285,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_shl_i64_32_bit_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1331,7 +1331,7 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { ; SI-LABEL: v_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1349,7 +1349,7 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -1394,8 +1394,8 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_64_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,8 +1407,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: s_shl_inline_imm_64_i64: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1444,8 +1444,8 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,8 +1457,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: s_shl_inline_imm_1_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1495,8 +1495,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1508,8 +1508,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
+; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1542,8 +1542,8 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1555,8 +1555,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1589,8 +1589,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1602,8 +1602,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1636,8 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1649,8 +1649,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1683,8 +1683,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1696,8 +1696,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 
0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1730,8 +1730,8 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1743,8 +1743,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,8 +1777,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1790,8 +1790,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a ; ; VI-LABEL: s_shl_inline_imm_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1824,8 +1824,8 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,8 +1837,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,8 +1874,8 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1887,8 +1887,8 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p ; ; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1926,32 +1926,32 @@ define 
amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, -4.0 -; SI-NEXT: s_mov_b32 s5, -1 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, -4.0 +; SI-NEXT: s_mov_b32 s1, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, -4.0 -; VI-NEXT: s_mov_b32 s5, -1 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, -4.0 +; VI-NEXT: s_mov_b32 s1, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: @@ -1982,32 +1982,32 @@ define 
amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, 4.0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 4.0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, 4.0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, 4.0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64: @@ -2033,32 +2033,32 @@ define 
amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %o define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { ; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s6, s[2:3], 0xd -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, -4.0 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, -4.0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s6, s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s5, -4.0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, -4.0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: @@ -2084,7 
+2084,7 @@ define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1 define amdgpu_kernel void @test_mul2(i32 %p) { ; SI-LABEL: test_mul2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2096,7 +2096,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) { ; ; VI-LABEL: test_mul2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 8c663d963b73e3..b81af3eb838f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -21,7 +21,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -40,7 +40,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: 
s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -71,7 +71,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -90,7 +90,7 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -101,7 +101,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -153,9 +153,7 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -180,20 +178,20 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_v_s_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -214,21 +212,21 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 
+; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -237,10 +235,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_v_s_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -250,12 +247,9 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_v_s_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; 
GFX11-NEXT: s_waitcnt vmcnt(0) @@ -277,20 +271,20 @@ define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 { ; GFX9-LABEL: shl_s_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -311,21 +305,21 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; CI-NEXT: s_load_dword s0, s[2:3], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 -; CI-NEXT: s_lshr_b32 s1, s0, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_lshr_b32 s0, s8, 16 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; 
CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s0, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -334,10 +328,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX10-LABEL: shl_s_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -347,12 +340,9 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX11-LABEL: shl_s_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -374,7 +364,7 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -385,7 +375,7 @@ define amdgpu_kernel void 
@shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -405,7 +395,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -426,7 +416,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_imm_v_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -437,9 +427,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_imm_v_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -462,7 +450,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -473,7 +461,7 @@ define amdgpu_kernel void 
@shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -493,7 +481,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -510,7 +498,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -521,9 +509,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -546,7 +532,7 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -558,7 +544,7 @@ define amdgpu_kernel void 
@v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -580,7 +566,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 @@ -609,7 +595,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX10-LABEL: v_shl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -621,9 +607,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX11-LABEL: v_shl_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -649,7 +633,7 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX9-LABEL: shl_v_imm_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -661,7 +645,7 @@ define amdgpu_kernel void 
@shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; VI-LABEL: shl_v_imm_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -685,7 +669,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -708,7 +692,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: shl_v_imm_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -720,9 +704,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: shl_v_imm_v4i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index ddf331816694ad..c5fc51091704b5 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -16,7 +16,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -32,7 +32,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -65,7 +65,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -84,7 +84,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -95,7 +95,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -106,9 +106,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -131,7 +129,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -153,7 +151,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -175,7 +173,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -198,7 +196,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -223,7 +221,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc @@ -240,7 +238,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -257,9 +255,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -291,7 +287,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_64_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -307,7 +303,7 @@ define 
amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_64_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -323,7 +319,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_64_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -340,7 +336,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_64_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -359,7 +355,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -370,7 +366,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -381,9 +377,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; 
GFX11-LABEL: v_test_i32_64_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -406,7 +400,7 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_65: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -422,7 +416,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_x_sub_65: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -438,7 +432,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_x_sub_65: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -455,7 +449,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_x_sub_65: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ 
-474,7 +468,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -485,7 +479,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -496,7 +490,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -507,7 +501,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -518,9 +512,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -533,9 +525,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -558,7 +548,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_65_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -574,7 +564,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i32_65_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -590,7 +580,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i32_65_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -607,7 +597,7 @@ define amdgpu_kernel void 
@v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i32_65_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -626,7 +616,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -637,7 +627,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -648,9 +638,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i32_65_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -673,7 +661,7 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 
s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -689,7 +677,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -705,7 +693,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -722,7 +710,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -741,7 +729,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -752,7 +740,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -763,7 +751,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -774,7 +762,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -785,9 +773,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -800,9 +786,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -825,7 +809,7 @@ define amdgpu_kernel void 
@v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -841,7 +825,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -857,7 +841,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg16_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -874,7 +858,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg16_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -893,7 +877,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -904,7 +888,7 
@@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -915,9 +899,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg16_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -940,7 +922,7 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -956,7 +938,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -972,7 +954,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_x_sub_neg17: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -989,7 +971,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_x_sub_neg17: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1008,7 +990,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1019,7 +1001,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1030,7 +1012,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1041,7 +1023,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -1052,9 +1034,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1067,9 +1047,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; ; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1092,7 +1070,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1108,7 +1086,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1124,7 +1102,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_i32_neg17_sub_x: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1141,7 +1119,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_i32_neg17_sub_x: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1160,7 +1138,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1171,7 +1149,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1182,9 +1160,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_i32_neg17_sub_x: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1207,7 +1183,7 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; SI-LABEL: s_test_i32_x_sub_64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_i32 s0, s0, 64 ; SI-NEXT: ;;#ASMSTART @@ -1217,7 +1193,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; VI-LABEL: s_test_i32_x_sub_64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sub_i32 s0, s0, 64 ; VI-NEXT: ;;#ASMSTART @@ -1227,7 +1203,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX9-LABEL: s_test_i32_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s0, s0, 64 ; GFX9-NEXT: ;;#ASMSTART @@ -1237,7 +1213,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX10-LABEL: s_test_i32_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sub_i32 s0, s0, 64 ; GFX10-NEXT: ;;#ASMSTART @@ -1247,7 +1223,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { ; ; GFX11-LABEL: s_test_i32_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s0, s0, 64 ; GFX11-NEXT: ;;#ASMSTART @@ -1262,7 +1238,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: 
v_test_i16_x_sub_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1278,7 +1254,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1294,7 +1270,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1311,7 +1287,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1330,7 +1306,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1341,7 +1317,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1352,9 +1328,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; ; GFX11-LABEL: v_test_i16_x_sub_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] @@ -1377,7 +1351,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 @@ -1395,7 +1369,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1413,7 +1387,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1431,7 +1405,7 @@ define 
amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2 @@ -1451,7 +1425,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1463,7 +1437,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1476,9 +1450,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1505,7 +1477,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -1527,7 +1499,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1549,7 +1521,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -1572,7 +1544,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -1597,7 +1569,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -1614,7 +1586,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc @@ -1631,9 +1603,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc @@ -1665,7 +1635,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1684,7 +1654,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1706,7 +1676,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1726,7 +1696,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: 
v_test_v2i16_x_sub_64_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1748,7 +1718,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -1759,7 +1729,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1770,9 +1740,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1795,7 +1763,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 @@ -1814,7 +1782,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1836,7 +1804,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1856,7 +1824,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -1878,7 +1846,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -1890,7 +1858,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007 ; GFX9-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) @@ -1902,7 +1870,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -1913,9 +1881,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -1938,7 +1904,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1957,7 +1923,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -1979,7 +1945,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -1999,7 +1965,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2021,7 +1987,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2033,7 +1999,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2045,7 +2011,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2056,9 +2022,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_sub_64_123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2082,7 +2046,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2100,7 +2064,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2120,7 +2084,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2139,7 +2103,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2161,7 +2125,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr 
addrspace(1) %out, ptr add ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2172,7 +2136,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2183,9 +2147,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; ; GFX11-LABEL: v_test_v2i16_x_sub_7_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2209,7 +2171,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2225,7 +2187,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 
v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2246,7 +2208,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2265,7 +2227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2286,7 +2248,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2297,7 +2259,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2308,9 +2270,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2333,7 +2293,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2349,7 +2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2370,7 +2330,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2389,7 +2349,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2410,7 +2370,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2422,7 +2382,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2434,7 +2394,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2445,9 +2405,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2470,7 +2428,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2486,7 +2444,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; SI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2507,7 +2465,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2526,7 +2484,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2547,7 +2505,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -2559,7 +2517,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2571,7 
+2529,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2582,9 +2540,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2608,7 +2564,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2627,7 +2583,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2649,7 +2605,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,7 +2625,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_not_b32_e32 v4, 31 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -2691,7 +2647,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2702,7 +2658,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2713,9 +2669,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2738,7 +2692,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) 
%out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2754,7 +2708,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2775,7 +2729,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2794,7 +2748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2815,7 +2769,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2826,7 +2780,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; 
GFX10-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2837,9 +2791,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2862,7 +2814,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -2880,7 +2832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -2900,7 +2852,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -2919,7 +2871,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -2941,7 +2893,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -2952,7 +2904,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -2963,9 +2915,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -2989,7 +2939,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3008,7 +2958,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3030,7 +2980,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3050,7 +3000,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3072,7 +3022,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3083,7 +3033,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3094,9 +3044,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3119,7 +3067,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3135,7 +3083,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3156,7 +3104,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3175,7 +3123,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3196,7 +3144,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3207,7 +3155,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3218,9 +3166,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3243,7 +3189,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3261,7 +3207,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3281,7 +3227,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -3300,7 +3246,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -3322,7 +3268,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3333,7 +3279,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3344,9 +3290,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3369,7 +3313,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3388,7 +3332,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3410,7 +3354,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3430,7 +3374,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; VI-GISEL: 
; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3452,7 +3396,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3464,7 +3408,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3476,7 +3420,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3487,7 +3431,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3498,9 +3442,7 @@ define 
amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3513,9 +3455,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3538,7 +3478,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3557,7 +3497,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3579,7 +3519,7 @@ define amdgpu_kernel void 
@v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3599,7 +3539,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3621,7 +3561,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3633,7 +3573,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3645,7 +3585,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -3656,7 +3596,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -3667,9 +3607,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3682,9 +3620,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3707,7 +3643,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; 
SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3726,7 +3662,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3748,7 +3684,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3768,7 +3704,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3790,7 +3726,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3801,7 +3737,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v1, v0, s[2:3] @@ -3812,9 +3748,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3837,7 +3771,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3856,7 +3790,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -3878,7 +3812,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -3898,7 +3832,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; VI-GISEL: ; %bb.0: -; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -3920,7 +3854,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -3931,7 +3865,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -3942,9 +3876,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out ; ; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -3967,7 +3899,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ 
-3984,7 +3916,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4003,7 +3935,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4021,7 +3953,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4043,7 +3975,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -4054,7 +3986,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] @@ -4065,9 
+3997,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4090,7 +4020,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 ; SI-SDAG-NEXT: s_mov_b32 s6, 0 ; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -4107,7 +4037,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; SI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s6, 0 @@ -4124,7 +4054,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -4141,7 +4071,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -4163,7 +4093,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4174,7 +4104,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_not_b32_e32 v2, 31 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -4186,7 +4116,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] @@ -4197,7 +4127,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] @@ -4208,9 +4138,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; 
GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] @@ -4223,9 +4151,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 9f3596359a6625..1ab63762ecbd72 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -5,22 +5,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: break_inserted_outside_of_loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s0, v0 +; SI-NEXT: v_and_b32_e32 v0, s2, v0 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: .LBB0_1: ; %ENDIF ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_and_b64 s[4:5], exec, vcc -; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: 
s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz .LBB0_1 ; SI-NEXT: ; %bb.2: ; %ENDLOOP -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -30,22 +30,22 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, ; ; FLAT-LABEL: break_inserted_outside_of_loop: ; FLAT: ; %bb.0: ; %main_body -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x2c +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_and_b32_e32 v0, s0, v0 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v0 ; FLAT-NEXT: v_and_b32_e32 v0, 1, v0 ; FLAT-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: .LBB0_1: ; %ENDIF ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 ; FLAT-NEXT: s_and_b64 s[4:5], exec, vcc -; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] ; FLAT-NEXT: s_cbranch_execnz .LBB0_1 ; FLAT-NEXT: ; %bb.2: ; %ENDLOOP -; FLAT-NEXT: s_or_b64 exec, exec, s[0:1] -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; FLAT-NEXT: s_or_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s2, 
s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s2, 0 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[2:3], exec +; SI-NEXT: s_cmp_eq_u32 s0, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec ; SI-NEXT: .LBB1_2: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB1_3: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: ; %bb.4: ; %exit ; SI-NEXT: s_endpgm @@ -96,23 +96,23 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; FLAT: ; %bb.0: ; %entry ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; FLAT-NEXT: s_mov_b64 s[0:1], 0 +; FLAT-NEXT: s_mov_b64 s[2:3], 0 ; FLAT-NEXT: s_mov_b64 s[4:5], 0 ; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc ; FLAT-NEXT: s_cbranch_execz .LBB1_2 ; FLAT-NEXT: ; %bb.1: ; %else -; FLAT-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_eq_u32 s2, 0 -; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[2:3], exec +; FLAT-NEXT: s_cmp_eq_u32 s0, 0 +; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 +; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec ; FLAT-NEXT: .LBB1_2: ; %endif ; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[2:3], exec, s[4:5] -; FLAT-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] ; FLAT-NEXT: 
s_cbranch_execnz .LBB1_3 ; FLAT-NEXT: ; %bb.4: ; %exit ; FLAT-NEXT: s_endpgm @@ -166,12 +166,12 @@ declare float @llvm.fabs.f32(float) nounwind readnone define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xa +; SI-NEXT: s_load_dword s2, s[0:1], 0xa ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 4 +; SI-NEXT: s_cmp_lt_i32 s2, 4 ; SI-NEXT: s_cbranch_scc1 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %for.cond.preheader -; SI-NEXT: s_load_dword s0, s[2:3], 0xc +; SI-NEXT: s_load_dword s0, s[0:1], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 @@ -186,12 +186,12 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x28 +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x28 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_lt_i32 s0, 4 +; FLAT-NEXT: s_cmp_lt_i32 s2, 4 ; FLAT-NEXT: s_cbranch_scc1 .LBB3_4 ; FLAT-NEXT: ; %bb.1: ; %for.cond.preheader -; FLAT-NEXT: s_load_dword s0, s[2:3], 0x30 +; FLAT-NEXT: s_load_dword s0, s[0:1], 0x30 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmpk_lt_i32 s0, 0x3e8 ; FLAT-NEXT: s_cbranch_scc0 .LBB3_4 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index e64dcb74267dd9..7c5537747dd7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 ; CHECK-NEXT: 
s_cselect_b64 s[2:3], -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 1d183210f95380..f9a17783f0d352 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -43,14 +43,14 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[6:7], 0x10 -; CHECK-NEXT: s_load_dword s10, s[6:7], 0x0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10 +; CHECK-NEXT: s_load_dword s10, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmpk_lg_i32 s0, 0x100 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_6 ; CHECK-NEXT: ; %bb.1: ; %if.else ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 ; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -65,7 +65,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: .LBB0_5: ; %Flow2 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_6: @@ -77,15 +77,15 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], -1 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_13 ; CHECK-NEXT: .LBB0_8: ; %Flow4 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; CHECK-NEXT: .LBB0_9: ; %UnifiedUnreachableBlock ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_10: ; %Flow6 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_and_saveexec_b64 
s[2:3], s[0:1] ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, s10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -96,14 +96,13 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec ; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; CHECK-NEXT: s_cbranch_execnz .LBB0_9 ; CHECK-NEXT: s_branch .LBB0_10 ; CHECK-NEXT: .LBB0_14: ; %cond.false.i8 ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index a65ad2cddb861e..1ab4be57621daa 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -468,4 +468,4 @@ entry: } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index b54df3b4d0c6c6..9a03d216c7a99d 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,7 +5,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -19,7 +19,7 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -39,13 +39,15 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: s_add_i32 s4, s4, s8 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -54,13 +56,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_add_i32 s4, s4, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mul_i32 s4, s6, s7 +; VI-NEXT: s_add_i32 s4, s4, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; 
VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -77,7 +81,7 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -92,7 +96,7 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -113,8 +117,8 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -126,8 +130,8 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun ; ; VI-LABEL: s_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -144,7 +148,7 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -162,7 +166,7 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_sext_i32_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -186,8 +190,8 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -199,8 +203,8 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun ; ; VI-LABEL: s_sext_i16_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -217,7 +221,7 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -231,7 +235,7 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -255,8 +259,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -271,8 +275,8 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: s_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -295,13 +299,15 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s5, s6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_cmp_eq_u32 s7, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -310,13 +316,15 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s5, s6 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_cmp_eq_u32 s7, s8 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -342,8 +350,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -367,8 +375,8 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n ; ; VI-LABEL: s_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -407,7 +415,7 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -435,7 +443,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs ; ; VI-LABEL: v_sext_v4i8_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -479,7 +487,7 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -505,7 +513,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) ; ; VI-LABEL: s_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,7 +552,7 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -572,7 +580,7 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr ; ; VI-LABEL: v_sext_v4i16_to_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; 
VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e86ee1adef3d03..539cfc71a80f93 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -42,30 +42,25 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 -; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x4 ; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s8, 16 -; GFX9-NEXT: s_mul_i32 s8, s8, s9 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, indirect@rel32@hi+12 -; GFX9-NEXT: v_mad_u32_u24 v3, v1, s9, v3 -; GFX9-NEXT: v_add_lshl_u32 v5, v3, v2, 3 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX9-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-NEXT: ds_write_b64 v5, v[3:4] -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0 +; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: ds_write_b64 v0, v[3:4] +; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[6:7] ; GFX9-NEXT: s_endpgm %fptr = alloca ptr, addrspace(5) %fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index ba1caf376975c5..5a241f85b2e2c8 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -837,5 +837,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index b8721129222043..d1f05358ff13af 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -7,8 +7,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i32_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -18,8 +18,8 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: sint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -36,8 +36,8 @@ 
define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: sint_to_fp_i1_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -50,8 +50,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: sint_to_fp_i1_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -70,8 +70,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) { ; CI-LABEL: sint_to_fp_i1_f64_load: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -84,8 +84,8 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; ; VI-LABEL: sint_to_fp_i1_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) 
%out, i64 %in) { ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -116,7 +116,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -134,7 +134,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CI-LABEL: v_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -153,7 +153,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -181,8 +181,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; CI-LABEL: s_sint_to_fp_i8_to_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -193,8 +193,8 @@ define amdgpu_kernel void 
@s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_sint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i32 s2, s2, 0x80000 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -230,8 +230,8 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -244,8 +244,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -281,8 +281,8 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -295,8 +295,8 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_sint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -351,8 +351,8 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; CI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dword s2, s[6:7], 0x2 -; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 3b35b2d3d9865f..b03726817c1b48 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -32,7 +32,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -87,7 +87,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -116,7 +116,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -146,15 +146,14 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, 
s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v4, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_cls_i32_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -162,17 +161,16 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -188,7 +186,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -211,7 +209,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s5, s2, s3 ; GFX8-NEXT: s_flbit_i32 s4, s3 @@ -232,7 +230,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s4, s2, s3 @@ -263,7 +261,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -291,7 +289,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_sint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -320,16 +318,14 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_sint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, v0, v1 -; GFX11-NEXT: v_cls_i32_e32 v4, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_cls_i32_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v4 @@ -337,15 +333,15 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; GFX11-NEXT: v_add_nc_u32_e32 v3, 32, v3 ; GFX11-NEXT: v_min_u32_e32 v3, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -361,8 +357,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; 
GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -395,8 +391,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -430,8 +426,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -471,7 +467,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -538,7 +534,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -607,24 +603,22 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 +; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 -; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_cls_i32_e32 v12, v1 -; GFX11-NEXT: v_cls_i32_e32 v14, v7 -; GFX11-NEXT: v_cls_i32_e32 v16, v5 +; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 +; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 +; GFX11-NEXT: v_cls_i32_e32 v10, v4 +; GFX11-NEXT: v_cls_i32_e32 v12, v2 +; GFX11-NEXT: v_cls_i32_e32 v14, v8 +; GFX11-NEXT: v_cls_i32_e32 v16, v6 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -644,33 +638,33 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 
v[2:3], v9, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 -; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_i32_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 +; GFX11-NEXT: 
v_ldexp_f32 v0, v4, v5 +; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -686,8 +680,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -724,8 +718,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s3, s6, s7 ; GFX8-NEXT: s_flbit_i32 s2, s7 @@ -762,8 +756,8 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_xor_b32 s3, s6, s7 @@ -808,7 +802,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -883,7 +877,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -958,24 +952,22 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_xor_b32_e32 v9, v2, v3 -; GFX11-NEXT: v_xor_b32_e32 v11, v0, v1 +; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4 +; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v13, v6, v7 -; GFX11-NEXT: v_xor_b32_e32 v15, v4, v5 -; GFX11-NEXT: v_cls_i32_e32 v10, v3 -; GFX11-NEXT: v_cls_i32_e32 v12, v1 -; GFX11-NEXT: v_cls_i32_e32 v14, v7 -; GFX11-NEXT: v_cls_i32_e32 v16, v5 +; GFX11-NEXT: v_xor_b32_e32 v13, v7, v8 +; GFX11-NEXT: v_xor_b32_e32 v15, v5, v6 +; GFX11-NEXT: v_cls_i32_e32 v10, v4 +; GFX11-NEXT: v_cls_i32_e32 v12, v2 +; GFX11-NEXT: v_cls_i32_e32 v14, v8 +; GFX11-NEXT: v_cls_i32_e32 v16, v6 ; GFX11-NEXT: v_ashrrev_i32_e32 v9, 31, v9 ; GFX11-NEXT: 
v_ashrrev_i32_e32 v11, 31, v11 ; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v13 @@ -995,41 +987,41 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-NEXT: v_min_u32_e32 v11, v14, v13 ; GFX11-NEXT: v_min_u32_e32 v12, v16, v15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 ; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: 
v_ldexp_f32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 -; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b08a35ab807324..b4b0d960e12e56 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; VI-LABEL: sitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ 
-43,7 +43,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; ; GFX11-LABEL: sitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @sitofp_i32_to_f16( ; SI-LABEL: sitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; VI-LABEL: sitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; ; GFX11-LABEL: sitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; SI-LABEL: sitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; VI-LABEL: sitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: 
sitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; SI-LABEL: sitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; VI-LABEL: sitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: sitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,21 +301,19 @@ entry: define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 
s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -323,26 +321,26 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,14 +348,16 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] ; VI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index fbb9ba0b73846e..233f4cc4fee501 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -2,10 +2,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX940 %s -define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX940-LABEL: test: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v0 ; GFX940-NEXT: v_mov_b32_e32 v3, v0 @@ -51,5 +51,3 @@ entry: ret void } declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32 immarg, i32 immarg) - -attributes #0 = { "amdgpu-no-agpr" } diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 93e210bb4c8090..f8c9827ecf7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: 
s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x24 +; GFX9: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x24 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[#LOAD + 2]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll index c54832d778434c..8b166b4c1bf3ff 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -333,7 +333,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: -; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff8000 +; GCN: s_cmp_lt_u32 s2, 0xffff8000 define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 @@ -552,7 +552,7 @@ endif: } ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: -; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xfffff7ff +; GCN: s_cmp_lt_u32 s2, 0xfffff7ff define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, ptr addrspace(1) %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 804fb8f258ffd4..c9413b61758d14 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -121,7 +121,7 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float> declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #2 = { nounwind 
"amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } -attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } +attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } +attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } +attributes #4 = { nounwind "amdgpu-num-vgpr"="6" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index daf0a2d1baa88a..b35f3ed28fc5b7 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; MUBUF-LABEL: test_inst_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -24,8 +24,8 @@ define amdgpu_kernel void @test_inst_offset_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; 
FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -61,7 +61,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_kernel() { ; MUBUF-LABEL: test_sgpr_offset_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -77,8 +77,8 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -193,7 +193,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -215,8 +215,8 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { ; ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s8, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND @@ -275,7 +275,7 @@ entry: define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; 
MUBUF-LABEL: test_sgpr_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -298,8 +298,8 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -341,7 +341,7 @@ entry: define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; MUBUF-LABEL: test_inst_offset_subregs_kernel: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_add_u32 s0, s0, s15 +; MUBUF-NEXT: s_add_u32 s0, s0, s7 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -365,8 +365,8 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() { ; ; FLATSCR-LABEL: test_inst_offset_subregs_kernel: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 0e68c306fb8b8e..27e4ba593aa4d4 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -16,7 +16,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, 
ptr addrspace(1) %in) { ; GFX6-LABEL: test: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s42, -1 +; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX6-NEXT: s_add_u32 s40, s40, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -29,11 +34,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v0, vcc ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 -; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_mov_b32 s42, -1 -; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: s_mov_b32 s2, 0x3fd00 ; GFX6-NEXT: s_mov_b64 s[8:9], 0x100 @@ -4668,7 +4668,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX9-FLATSCR-LABEL: test: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -4680,8 +4682,6 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 -; GFX9-FLATSCR-NEXT: s_addc_u32 
flat_scratch_hi, s7, 0 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill @@ -7294,11 +7294,11 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; ; GFX10-FLATSCR-LABEL: test: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 +; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0 @@ -9752,10 +9752,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 -; GFX6-NEXT: s_add_u32 s40, s40, s9 +; GFX6-NEXT: s_add_u32 s40, s40, s3 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 @@ -10312,14 +10312,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 -; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -10496,11 +10496,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: ; GFX10-FLATSCR: ; %bb.0: ; %entry -; GFX10-FLATSCR-NEXT: s_add_u32 s6, s6, s11 -; GFX10-FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[2:3], 0x24 +; GFX10-FLATSCR-NEXT: s_add_u32 s2, s2, s5 +; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll index 5338bc8f7aa7ac..f5e94df415ae4f 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn 
-mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; The test was originally written to spill an SGPR to scratch without -; having spare SGPRs available to save exec. This scenario won't be -; true anymore as we reserve SGPR(s) upfront for saving exec. +; The test was originally written to spill an SGPR to scratch without having spare SGPRs +; available to save exec. This scenario won't be true anymore as we reseve SGPR(s) +; upfront for saving exec. define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: @@ -12,7 +12,7 @@ define amdgpu_kernel void @test() #1 { ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10-NEXT: s_add_u32 s8, s8, s7 +; GFX10-NEXT: s_add_u32 s8, s8, s1 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[0:7] @@ -37,8 +37,5 @@ define amdgpu_kernel void @test() #1 { ret void } -; FIXME: amdgpu-no attributese are a workaround for cases where the -; number of incoming arguments is larger than the number of permitted -; registers. 
attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" "amdgpu-no-queue-ptr" "amdgpu-no-dispatch-id" } +attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 882356d994fc68..be3e0b4cbd63f5 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -5,17 +5,17 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-LABEL: name: test_spill_av_class ; GCN: bb.0 (%ir-block.0): - ; GCN-NEXT: liveins: $sgpr6_sgpr7 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) + ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %12.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %6.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) 
undef`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %12 + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %6 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll index bc13b8d0330177..c1c69ce568a9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -44,7 +44,7 @@ define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @kernel_writelane_intrinsic(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GCN-LABEL: kernel_writelane_intrinsic: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v1, 45 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index cd06a060a50cd8..b8cf692372069a 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -27,7 +27,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -94,7 +94,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -147,7 +147,7 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -175,7 +175,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -243,7 +243,7 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 
@@ -282,7 +282,7 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -395,8 +395,8 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_ashr_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -409,8 +409,8 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) { ; ; VI-LABEL: s_ashr_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -443,7 +443,7 @@ entry: define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_i64_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -461,7 +461,7 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_i64_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -513,7 +513,7 @@ entry: define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, 
ptr addrspace(1) %in) { ; SI-LABEL: ashr_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -533,7 +533,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -597,7 +597,7 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: ashr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -623,7 +623,7 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s10, s2 @@ -714,9 +714,9 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -730,9 +730,9 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) 
%out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -768,7 +768,7 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -785,7 +785,7 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -833,9 +833,9 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { ; SI-LABEL: s_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x1d +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -849,9 +849,9 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[2:3], 
0x50 -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x74 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -887,7 +887,7 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_ashr_63_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,7 +905,7 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_ashr_63_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index abf013e39eefa7..bcc67e974ae4a2 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[2:3] @@ -25,7 +25,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i16_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, 
s6 @@ -49,7 +49,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i16_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -113,7 +113,7 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -149,7 +149,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -192,7 +192,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -277,7 +277,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @srem_i32_4(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -314,7 +314,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -363,7 +363,7 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i32_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dword v1, v0, s[2:3] @@ -381,7 +381,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i32_7: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -406,7 +406,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i32_7: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -459,7 +459,7 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -521,7 +521,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -590,7 +590,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -723,7 +723,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -747,7 +747,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -778,7 +778,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -842,7 +842,7 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] offset:16 @@ -958,7 +958,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i32: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1081,7 +1081,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v4i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s4, s2, 16 ; TONGA-NEXT: s_addc_u32 s5, s3, 0 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i32_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -1355,7 +1355,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i32_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 
0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -1491,7 +1491,7 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TAHITI-LABEL: srem_i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v4, 0 @@ -1836,7 +1836,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v4, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -2589,7 +2589,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
global_load_dwordx2 v[0:1], v2, s[2:3] @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -2630,7 +2630,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -2684,7 +2684,7 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v2i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -3346,7 +3346,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 @@ -4733,7 +4733,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, 
ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -4757,7 +4757,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v2i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s7, 0xf000 ; TAHITI-NEXT: s_mov_b32 s6, -1 ; TAHITI-NEXT: s_mov_b32 s10, s6 @@ -4788,7 +4788,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v2i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 @@ -4860,7 +4860,7 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[6:7] offset:32 @@ -5486,7 +5486,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TAHITI-LABEL: srem_v4i64: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: v_mov_b32_e32 v8, 0 @@ -6088,7 +6088,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; 
TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 @@ -8883,7 +8883,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64_4: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] @@ -8924,7 +8924,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TAHITI-LABEL: srem_v4i64_4: ; TAHITI: ; %bb.0: -; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; TAHITI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; TAHITI-NEXT: s_mov_b32 s3, 0xf000 ; TAHITI-NEXT: s_mov_b32 s2, -1 ; TAHITI-NEXT: s_mov_b32 s10, s2 @@ -8972,7 +8972,7 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: srem_v4i64_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8498e9af46f2b5..93fab7dff253bc 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -442,8 +442,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -477,8 +477,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -519,8 +519,8 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -554,8 +554,8 @@ define 
amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -650,14 +650,14 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -691,14 +691,14 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -739,14 +739,14 @@ define amdgpu_kernel 
void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-NEXT: s_abs_i32 s8, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -780,14 +780,14 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s1, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -829,18 +829,18 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_abs_i32 s8, s0 +; GCN-NEXT: s_abs_i32 s8, s2 ; 
GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_abs_i32 s2, s3 @@ -868,18 +868,18 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_abs_i32 s8, s0 +; GCN-IR-NEXT: s_abs_i32 s8, s2 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_abs_i32 s2, s3 @@ -915,8 +915,8 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1049,8 +1049,8 @@ define 
amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem33_64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 @@ -1153,8 +1153,8 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1189,22 +1189,22 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[4:5], 16 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s3, s2 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 16 -; GCN-IR-NEXT: s_xor_b64 
s[4:5], s[4:5], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s4, s4, s0 -; GCN-IR-NEXT: s_subb_u32 s5, s5, s0 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 ; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] @@ -1271,20 +1271,20 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s11 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s11 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s7, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mul_i32 s2, s6, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s7, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s3, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s14, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc @@ -1302,7 +1302,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1411,7 +1411,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 @@ -1984,7 +1984,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,7 +2054,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -2085,7 +2085,7 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 03d1dddd7b6061..418c160d4244af 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -26,7 +26,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -64,7 +64,7 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -83,7 +83,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -124,7 +124,7 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 
; SI-LABEL: lshr_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -146,7 +146,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -194,7 +194,7 @@ define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -212,7 +212,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: lshr_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 @@ -258,7 +258,7 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: lshr_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -284,7 +284,7 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; VI-NEXT: s_mov_b32 s19, 0xf000 @@ -370,8 +370,8 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: s_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0x14 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -382,8 +382,8 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % ; ; VI-LABEL: s_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x50 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x50 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -411,7 +411,7 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 % define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_lshr_32_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -428,7 +428,7 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_lshr_32_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 132775d81ca1ad..9ad9fa03048655 100644 --- 
a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -10,12 +10,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: s_load_dword s0, s[2:3], 0x24 +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 -; MUBUF-NEXT: s_add_u32 s36, s36, s9 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -48,12 +48,12 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR-LABEL: kernel_background_evaluate: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 s6, s6, s11 +; FLATSCR-NEXT: s_add_u32 s2, s2, s5 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 -; FLATSCR-NEXT: s_addc_u32 s7, s7, 0 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; FLATSCR-NEXT: s_load_dword s2, s[2:3], 0x24 +; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 @@ -81,7 +81,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; MUBUF11-LABEL: kernel_background_evaluate: ; MUBUF11: ; %bb.0: ; %entry -; MUBUF11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; MUBUF11-NEXT: s_load_b32 s2, s[0:1], 0x24 ; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: 
v_dual_mov_b32 v3, 0 ; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 @@ -108,7 +108,7 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; ; FLATSCR11-LABEL: kernel_background_evaluate: ; FLATSCR11: ; %bb.0: ; %entry -; FLATSCR11-NEXT: s_load_b32 s2, s[2:3], 0x24 +; FLATSCR11-NEXT: s_load_b32 s2, s[0:1], 0x24 ; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index 6ddf0986755f95..5c6f0019f1ed93 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -20,23 +20,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-NEXT: .amdhsa_kernel max_alignment_128 ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 256 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -57,7 +57,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -71,23 +71,23 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-NEXT: .amdhsa_kernel max_alignment_128 ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: 
.amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -117,7 +117,7 @@ define amdgpu_kernel void @max_alignment_128() #0 { define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -131,23 +131,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-NEXT: .amdhsa_kernel stackrealign_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 12 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: 
.amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -168,7 +168,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -182,23 +182,23 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-NEXT: .amdhsa_kernel stackrealign_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; GFX9-NEXT: 
.amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 { define amdgpu_kernel void @alignstack_attr() #2 { ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: -; VI-NEXT: s_add_u32 s0, s0, s17 +; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -242,23 +242,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-NEXT: .amdhsa_kernel alignstack_attr ; VI-NEXT: .amdhsa_group_segment_fixed_size 0 ; VI-NEXT: .amdhsa_private_segment_fixed_size 128 -; VI-NEXT: .amdhsa_kernarg_size 56 -; VI-NEXT: .amdhsa_user_sgpr_count 14 +; VI-NEXT: .amdhsa_kernarg_size 0 +; VI-NEXT: .amdhsa_user_sgpr_count 6 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; VI-NEXT: 
.amdhsa_system_sgpr_workgroup_id_z 0 ; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; VI-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; VI-NEXT: .amdhsa_next_free_vgpr 1 -; VI-NEXT: .amdhsa_next_free_sgpr 18 +; VI-NEXT: .amdhsa_next_free_sgpr 8 ; VI-NEXT: .amdhsa_reserve_vcc 0 ; VI-NEXT: .amdhsa_reserve_flat_scratch 0 ; VI-NEXT: .amdhsa_float_round_mode_32 0 @@ -279,7 +279,7 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 s0, s0, s7 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -293,23 +293,23 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-NEXT: .amdhsa_kernel alignstack_attr ; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0 ; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128 -; GFX9-NEXT: .amdhsa_kernarg_size 56 -; GFX9-NEXT: .amdhsa_user_sgpr_count 14 +; GFX9-NEXT: .amdhsa_kernarg_size 0 +; GFX9-NEXT: .amdhsa_user_sgpr_count 6 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 ; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 -; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 ; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0 
-; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2 +; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; GFX9-NEXT: .amdhsa_next_free_vgpr 1 -; GFX9-NEXT: .amdhsa_next_free_sgpr 18 +; GFX9-NEXT: .amdhsa_next_free_sgpr 8 ; GFX9-NEXT: .amdhsa_reserve_vcc 0 ; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0 ; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index 7cd67f2adbd152..ad2fc2c2dbe5bd 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -121,31 +121,31 @@ define amdgpu_kernel void @kernel_store_stacksave() { define amdgpu_kernel void @kernel_store_stacksave_nocall() { ; WAVE32-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[12:13] -; WAVE32-OPT-NEXT: s_mov_b32 s12, s0 +; WAVE32-OPT-NEXT: s_getpc_b64 s[4:5] +; WAVE32-OPT-NEXT: s_mov_b32 s4, s0 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; WAVE32-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s15, 21 -; WAVE32-OPT-NEXT: s_add_u32 s12, s12, s9 -; WAVE32-OPT-NEXT: s_addc_u32 s13, s13, 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s7, 21 +; WAVE32-OPT-NEXT: s_add_u32 s4, s4, s1 +; WAVE32-OPT-NEXT: s_addc_u32 s5, s5, 0 ; WAVE32-OPT-NEXT: s_lshr_b32 s0, s32, 5 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; WAVE32-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: kernel_store_stacksave_nocall: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[12:13] -; WAVE64-OPT-NEXT: s_mov_b32 s12, s0 +; WAVE64-OPT-NEXT: s_getpc_b64 s[4:5] +; WAVE64-OPT-NEXT: s_mov_b32 s4, s0 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 0 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; 
WAVE64-OPT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s12, s12, s9 -; WAVE64-OPT-NEXT: s_addc_u32 s13, s13, 0 +; WAVE64-OPT-NEXT: s_add_u32 s4, s4, s1 +; WAVE64-OPT-NEXT: s_addc_u32 s5, s5, 0 ; WAVE64-OPT-NEXT: s_lshr_b32 s0, s32, 6 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, s0 -; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; WAVE64-OPT-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; WAVE64-OPT-NEXT: s_endpgm ; ; WAVE32-O0-LABEL: kernel_store_stacksave_nocall: @@ -790,7 +790,7 @@ define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) { define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; WAVE32-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE32-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s0 @@ -799,7 +799,7 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { ; ; WAVE64-OPT-LABEL: kernel_stacksave_sgpr: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_load_dword s0, s[2:3], 0x0 +; WAVE64-OPT-NEXT: s_load_dword s0, s[0:1], 0x0 ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s0 @@ -849,72 +849,54 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) { define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: -; WAVE32-OPT-NEXT: s_getpc_b64 s[20:21] -; WAVE32-OPT-NEXT: s_mov_b32 s20, s0 -; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; WAVE32-OPT-NEXT: s_getpc_b64 s[8:9] +; WAVE32-OPT-NEXT: s_mov_b32 s8, s0 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 -; 
WAVE32-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE32-OPT-NEXT: s_mov_b32 s14, s8 -; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE32-OPT-NEXT: s_mov_b32 s13, s7 +; WAVE32-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; WAVE32-OPT-NEXT: s_mov_b32 s0, s32 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 +; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo ; WAVE32-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-OPT-NEXT: s_bitset0_b32 s23, 21 -; WAVE32-OPT-NEXT: s_add_u32 s20, s20, s9 -; WAVE32-OPT-NEXT: s_addc_u32 s21, s21, 0 -; WAVE32-OPT-NEXT: s_lshr_b32 s15, s4, 5 -; WAVE32-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] -; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE32-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE32-OPT-NEXT: s_bitset0_b32 s11, 21 +; WAVE32-OPT-NEXT: s_add_u32 s8, s8, s1 +; WAVE32-OPT-NEXT: s_addc_u32 s9, s9, 0 +; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5 +; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] +; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] +; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE32-OPT-NEXT: ;;#ASMSTART -; WAVE32-OPT-NEXT: ; use s15 +; WAVE32-OPT-NEXT: ; use s6 ; WAVE32-OPT-NEXT: ;;#ASMEND ; WAVE32-OPT-NEXT: s_endpgm ; ; WAVE64-OPT-LABEL: 
kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: -; WAVE64-OPT-NEXT: s_getpc_b64 s[20:21] -; WAVE64-OPT-NEXT: s_mov_b32 s20, s0 -; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 -; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; WAVE64-OPT-NEXT: s_getpc_b64 s[8:9] +; WAVE64-OPT-NEXT: s_mov_b32 s8, s0 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: s_mov_b64 s[10:11], s[4:5] -; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 -; WAVE64-OPT-NEXT: s_mov_b32 s14, s8 -; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: s_mov_b32 s12, s6 -; WAVE64-OPT-NEXT: s_mov_b32 s13, s7 +; WAVE64-OPT-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; WAVE64-OPT-NEXT: s_mov_b32 s0, s32 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 +; WAVE64-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo ; WAVE64-OPT-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-OPT-NEXT: s_add_u32 s20, s20, s9 -; WAVE64-OPT-NEXT: s_addc_u32 s21, s21, 0 -; WAVE64-OPT-NEXT: s_lshr_b32 s15, s4, 6 -; WAVE64-OPT-NEXT: s_mov_b64 s[4:5], s[0:1] -; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[2:3] -; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[20:21] -; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[22:23] -; WAVE64-OPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 +; WAVE64-OPT-NEXT: s_add_u32 s8, s8, s1 +; WAVE64-OPT-NEXT: s_addc_u32 s9, s9, 0 +; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6 +; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9] +; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11] +; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE64-OPT-NEXT: buffer_store_dword v4, off, s[20:23], s32 offset:4 
-; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4 +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s15 +; WAVE64-OPT-NEXT: ; use s6 ; WAVE64-OPT-NEXT: ;;#ASMEND ; WAVE64-OPT-NEXT: s_endpgm ; @@ -1262,70 +1244,70 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-OPT: ; %bb.0: ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-OPT-NEXT: s_mov_b32 s20, s33 +; WAVE32-OPT-NEXT: s_mov_b32 s8, s33 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s16, -1 -; WAVE32-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s16 -; WAVE32-OPT-NEXT: v_writelane_b32 v32, s30, 0 +; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 +; WAVE32-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 +; WAVE32-OPT-NEXT: v_writelane_b32 v31, s30, 0 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0x1200 -; WAVE32-OPT-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE32-OPT-NEXT: v_writelane_b32 v31, s31, 1 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-OPT-NEXT: v_mov_b32_e32 v1, 17 -; WAVE32-OPT-NEXT: s_mov_b32 s18, s32 -; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE32-OPT-NEXT: s_lshr_b32 s19, s18, 5 +; WAVE32-OPT-NEXT: s_mov_b32 s6, s32 +; WAVE32-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE32-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE32-OPT-NEXT: s_lshr_b32 s7, s6, 5 ; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE32-OPT-NEXT: s_swappc_b64 
s[30:31], s[16:17] -; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE32-OPT-NEXT: v_readlane_b32 s30, v31, 0 ; WAVE32-OPT-NEXT: ;;#ASMSTART -; WAVE32-OPT-NEXT: ; use s19 +; WAVE32-OPT-NEXT: ; use s7 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 -; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s6 +; WAVE32-OPT-NEXT: v_readlane_b32 s31, v31, 1 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 -; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 +; WAVE32-OPT-NEXT: s_mov_b32 s33, s8 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-OPT: ; %bb.0: ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-OPT-NEXT: s_mov_b32 s20, s33 +; WAVE64-OPT-NEXT: s_mov_b32 s8, s33 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s32 -; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; WAVE64-OPT-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE64-OPT-NEXT: s_mov_b64 exec, s[16:17] -; WAVE64-OPT-NEXT: v_writelane_b32 v32, s30, 0 +; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; WAVE64-OPT-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] +; WAVE64-OPT-NEXT: v_writelane_b32 v31, s30, 0 ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0x2400 -; WAVE64-OPT-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE64-OPT-NEXT: v_writelane_b32 v31, s31, 1 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v0, 42 ; WAVE64-OPT-NEXT: v_mov_b32_e32 v1, 17 -; WAVE64-OPT-NEXT: s_mov_b32 s18, s32 -; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi -; 
WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo -; WAVE64-OPT-NEXT: s_lshr_b32 s19, s18, 6 +; WAVE64-OPT-NEXT: s_mov_b32 s6, s32 +; WAVE64-OPT-NEXT: s_mov_b32 s5, stack_passed_argument@abs32@hi +; WAVE64-OPT-NEXT: s_mov_b32 s4, stack_passed_argument@abs32@lo +; WAVE64-OPT-NEXT: s_lshr_b32 s7, s6, 6 ; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5] +; WAVE64-OPT-NEXT: v_readlane_b32 s30, v31, 0 ; WAVE64-OPT-NEXT: ;;#ASMSTART -; WAVE64-OPT-NEXT: ; use s19 +; WAVE64-OPT-NEXT: ; use s7 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 -; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s6 +; WAVE64-OPT-NEXT: v_readlane_b32 s31, v31, 1 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE64-OPT-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 -; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 +; WAVE64-OPT-NEXT: s_mov_b32 s33, s8 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index f7eb760fda084f..01ad9665971394 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -8,10 +8,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -21,8 +21,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX7-LABEL: store_lds_v4i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -35,8 +35,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; ; GFX6-LABEL: store_lds_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -50,10 +50,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX10-LABEL: store_lds_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -64,8 +64,8 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX11-LABEL: store_lds_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 @@ -79,10 +79,10 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 @@ -123,8 +123,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -176,8 +176,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -230,10 +230,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: s_lshr_b32 s3, s6, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -275,8 +275,8 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 @@ -317,10 +317,10 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 @@ -337,8 +337,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -366,8 +366,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: 
store_lds_v4i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -396,10 +396,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -417,8 +417,8 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -439,10 +439,10 @@ define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 
v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -453,8 +453,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -468,8 +468,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -484,10 +484,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -499,8 +499,8 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 
v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -515,10 +515,10 @@ define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -528,8 +528,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX7-LABEL: store_lds_v4i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -542,8 +542,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; ; GFX6-LABEL: store_lds_v4i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -557,12 +557,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX10-LABEL: store_lds_v4i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 @@ -571,8 +571,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-LABEL: store_lds_v4i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -587,10 +587,10 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -600,8 +600,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX7-LABEL: store_lds_v4i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -614,8 +614,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; ; GFX6-LABEL: store_lds_v4i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX6-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 @@ -629,10 +629,10 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX10-LABEL: store_lds_v4i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 @@ -643,8 +643,8 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX11-LABEL: store_lds_v4i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v4, s4 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll index 64ce67a1a3deeb..507b411996d973 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 
; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -33,8 +33,8 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; ; GFX6-LABEL: store_lds_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -48,21 +48,21 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) ; GFX10-LABEL: store_lds_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s0 @@ -75,10 +75,10 @@ define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 @@ -110,8 +110,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -152,8 +152,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -195,10 +195,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -231,8 +231,8 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 @@ -265,10 +265,10 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 @@ -282,8 +282,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -306,8 +306,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; 
GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -331,10 +331,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 @@ -349,8 +349,8 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 @@ -368,10 +368,10 @@ define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 @@ -381,8 +381,8 @@ define amdgpu_kernel void 
@store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align4: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -395,8 +395,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -410,10 +410,10 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 @@ -424,8 +424,8 @@ define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 @@ -439,10 +439,10 @@ define amdgpu_kernel void 
@store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -452,8 +452,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX7-LABEL: store_lds_v3i32_align8: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -466,8 +466,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; ; GFX6-LABEL: store_lds_v3i32_align8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -481,10 +481,10 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX10-LABEL: store_lds_v3i32_align8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; 
GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 @@ -495,8 +495,8 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 ; GFX11-LABEL: store_lds_v3i32_align8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -510,20 +510,20 @@ define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i3 define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32_align16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: ds_write_b96 v3, v[0:2] ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4 -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -535,8 +535,8 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; ; GFX6-LABEL: store_lds_v3i32_align16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 ; GFX6-NEXT: s_mov_b32 m0, 
-1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -550,21 +550,21 @@ define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i ; GFX10-LABEL: store_lds_v3i32_align16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 3644bef9c20a1f..f88aaf389ca9ae 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,12 +50,12 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s6, 14 +; HAWAII-NEXT: s_or_b32 s0, s4, 14 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s7 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v1, s2 @@ -70,12 +70,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s6, 14 +; FIJI-NEXT: s_or_b32 s0, s4, 14 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s7 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s3, s1, 0xffff @@ -94,9 +94,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -114,9 +114,9 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[6:7] offset:14 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s3, s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 
v1, s2 @@ -133,16 +133,16 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX11-LABEL: local_store_i55: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[2:3] offset:14 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s3, s1, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: s_and_b32 s1, s3, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 ; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 @@ -156,8 +156,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s2 @@ -169,8 +169,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: 
s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s2 @@ -182,8 +182,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -195,8 +195,8 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -208,10 +208,10 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 @@ -223,9 +223,9 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[6:7], 0x4 -; HAWAII-NEXT: s_load_dword s3, s[6:7], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; HAWAII-NEXT: s_load_dword s2, 
s[4:5], 0x4 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_and_b32 s2, s2, 1 @@ -239,9 +239,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[6:7], 0x10 -; FIJI-NEXT: s_load_dword s3, s[6:7], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x10 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_and_b32 s2, s2, 1 @@ -255,9 +255,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -271,9 +271,9 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX10-LABEL: local_store_i65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x10 -; GFX10-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x10 +; GFX10-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 @@ -287,13 +287,13 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX11-LABEL: local_store_i65: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; 
GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 -; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s4, 1 +; GFX11-NEXT: s_and_b32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 ; GFX11-NEXT: ds_store_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 60448735632548..ded308ae4f2307 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX6-LABEL: s_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 @@ -43,7 +43,7 @@ define 
amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -60,8 +60,8 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; GFX6-LABEL: s_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -72,10 +72,10 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX8-LABEL: s_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -84,18 +84,18 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX9-LABEL: s_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, 0x4d2, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, 0x4d2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, 
v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -112,7 +112,7 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -130,7 +130,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -144,7 +144,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3] @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: 
test_sub_imm_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: test_sub_imm_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -208,7 +208,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: test_sub_imm_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] @@ -219,7 +219,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace ; ; GFX12-LABEL: test_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] @@ -238,7 +238,7 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -257,7 +257,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -272,7 +272,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -284,7 +284,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3] @@ -306,7 +306,7 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -328,7 +328,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -350,7 +350,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 @@ -365,7 +365,7 
@@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 @@ -391,7 +391,7 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -412,7 +412,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX8-LABEL: test_sub_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -432,7 +432,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: test_sub_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc @@ -446,11 +446,9 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX12-LABEL: test_sub_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_u16 v1, v0, 
s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -474,7 +472,7 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -499,7 +497,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -517,7 +515,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -529,11 +527,9 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v2i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -555,7 +551,7 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: test_sub_v4i16: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_mov_b32 s11, s7 @@ -587,7 +583,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX8-LABEL: test_sub_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -608,7 +604,7 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX9-LABEL: test_sub_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -621,11 +617,9 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX12-LABEL: test_sub_v4i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -648,8 +642,8 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) @@ -662,8 +656,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s2, s4, s6 ; GFX8-NEXT: s_subb_u32 s3, s5, s7 @@ -676,22 +670,22 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s2, s4, s6 -; GFX9-NEXT: s_subb_u32 s3, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_sub_u32 s0, s4, s6 +; GFX9-NEXT: s_subb_u32 s1, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -708,8 +702,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { ; GFX6-LABEL: v_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -731,8 +725,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX8-LABEL: v_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -753,12 +747,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX9-LABEL: v_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -769,12 +763,10 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX12-LABEL: v_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7] @@ -799,8 +791,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -824,8 +816,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -848,12 +840,12 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -866,12 +858,10 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v2i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; 
GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7] @@ -898,8 +888,8 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { ; GFX6-LABEL: v_test_sub_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, 0 ; GFX6-NEXT: s_mov_b32 s15, s11 @@ -931,8 +921,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX8-LABEL: v_test_sub_v4i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -971,14 +961,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; ; GFX9-LABEL: v_test_sub_v4i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 
v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 @@ -997,12 +987,10 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX12-LABEL: v_test_sub_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GFX12-NEXT: v_mov_b32_e32 v16, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index fe234a82ba6f7f..6ec213a06999b6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,13 +8,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -24,8 +24,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -49,13 +49,13 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: v_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -67,10 +67,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX11-LABEL: v_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -98,25 +96,25 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace 
define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x0 -; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_pk_sub_i16 v0, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -139,23 +137,23 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; GFX10-LABEL: s_test_sub_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -177,7 +175,7 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 { ; GCN-LABEL: s_test_sub_self_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -187,7 +185,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX10-LABEL: s_test_sub_self_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 @@ -197,7 +195,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX11-LABEL: s_test_sub_self_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 @@ -216,7 +214,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +227,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -248,7 +246,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -260,7 +258,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -279,7 +277,7 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -293,7 +291,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -312,7 +310,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX10-LABEL: v_test_sub_v2i16_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -326,9 +324,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr ; ; GFX11-LABEL: v_test_sub_v2i16_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -353,7 +349,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -367,7 +363,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -386,7 +382,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX10-LABEL: v_test_sub_v2i16_neg_constant: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -400,9 +396,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ; ; GFX11-LABEL: v_test_sub_v2i16_neg_constant: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -426,7 +420,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -439,7 +433,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -458,7 +452,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -472,9 +466,7 @@ define amdgpu_kernel void 
@v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p ; ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -498,7 +490,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -511,7 +503,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -529,7 +521,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -543,9 +535,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % ; ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -570,7 +560,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) % define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc @@ -583,7 +573,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -601,7 +591,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc @@ -615,9 +605,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou ; ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc @@ -642,13 +630,13 @@ define 
amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %ou define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -660,8 +648,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -684,13 +672,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -704,10 +692,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -740,14 +726,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -760,8 +746,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s7 @@ -786,13 +772,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -808,10 +794,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -845,13 +829,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -863,8 +847,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -889,13 +873,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 @@ -909,10 +893,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -945,12 +927,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -965,8 +947,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -992,13 +974,13 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] +; 
GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 @@ -1015,10 +997,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index d4329aec2021c0..873567c3ab6f4c 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,7 +7,7 @@ target triple="amdgcn--" ; NOTE: breaking large PHIs is disabled here else this example is completely optimized out ; before reaching codegen. 
-define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) nounwind { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 @@ -59,4 +59,3 @@ ife: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 19d633651fdd0d..1be420eccb353f 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 2097579e0c9959..acdcd16a1f9efe 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. 
-define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 0 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 775c62e73261a9..0aac07342db849 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -4,7 +4,7 @@ ; TODO: Update to check for granulated sgpr count directive once one is added. -define amdgpu_kernel void @kern() #0 { +define amdgpu_kernel void @kern() { ; ASM-LABEL: kern: ; ASM: .amdhsa_next_free_sgpr 5 ; ASM: .amdhsa_reserve_xnack_mask 1 @@ -23,7 +23,5 @@ entry: ret void } -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 0daa6860072616..ff019f77413539 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -12,7 +12,7 @@ declare void @llvm.debugtrap() #1 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: trap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -22,9 +22,9 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 @@ -34,7 +34,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX900-LABEL: trap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -44,7 +44,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-NOTRAP-GFX900-LABEL: trap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -54,7 +54,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; ; HSA-TRAP-GFX1100-LABEL: trap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) @@ -103,7 +103,7 @@ define 
amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { ; NOHSA-TRAP-GFX900-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -120,7 +120,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -136,12 +136,12 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_endpgm ; HSA-TRAP-GFX803-NEXT: .LBB1_2: ; %trap -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-GFX803-NEXT: s_trap 2 ; ; HSA-TRAP-GFX900-LABEL: non_entry_trap: ; HSA-TRAP-GFX900: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -158,7 +158,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-NOTRAP-GFX900-LABEL: non_entry_trap: ; HSA-NOTRAP-GFX900: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; 
HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -175,7 +175,7 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; ; HSA-TRAP-GFX1100-LABEL: non_entry_trap: ; HSA-TRAP-GFX1100: ; %bb.0: ; %entry -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -266,7 +266,7 @@ ret: define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { ; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -280,8 +280,8 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX803-LABEL: trap_with_use_after: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] -; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -296,7 +296,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX900-LABEL: trap_with_use_after: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: s_waitcnt 
lgkmcnt(0) ; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -308,7 +308,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -322,7 +322,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; ; HSA-TRAP-GFX1100-LABEL: trap_with_use_after: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc @@ -401,7 +401,7 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-LABEL: debugtrap: ; NOHSA-TRAP-GFX900: ; %bb.0: -; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -414,7 +414,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: -; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) @@ -429,7 +429,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) 
nocapture readonly %arg0) ; ; HSA-TRAP-GFX900-LABEL: debugtrap: ; HSA-TRAP-GFX900: ; %bb.0: -; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -443,7 +443,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-NOTRAP-GFX900-LABEL: debugtrap: ; HSA-NOTRAP-GFX900: ; %bb.0: -; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2 @@ -456,7 +456,7 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; ; HSA-TRAP-GFX1100-LABEL: debugtrap: ; HSA-TRAP-GFX1100: ; %bb.0: -; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; HSA-TRAP-GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; HSA-TRAP-GFX1100-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 ; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v2, 2 ; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll index 9bab3e6fcf8c45..2f687295af73e0 100644 --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -31,14 +31,14 @@ declare void @llvm.debugtrap() #1 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 5080 +; MESA-TRAP-NEXT: .long 208 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 5016 +; NOMESA-TRAP-NEXT: .long 144 ; GCN-LABEL: {{^}}hsa_trap: -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP: s_trap 2 ; HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 @@ -59,11 +59,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) { ; 
MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 5080 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 5016 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_debugtrap: ; HSA-TRAP: s_trap 3 @@ -102,7 +102,7 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: @@ -124,7 +124,7 @@ ret: ; NO-TRAP-BIT: enable_trap_handler = 0 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap -; HSA-TRAP: s_mov_b64 s[0:1], s[6:7] +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] ; HSA-TRAP-NEXT: s_trap 2 define amdgpu_kernel void @non_entry_trap_no_unreachable(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index 22eb7dddb84f4d..c0c56ebb166108 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -85,8 +85,8 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) { define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture %arg2) local_unnamed_addr { ; SI-LABEL: truncate_high_elt_extract_vector: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -103,8 +103,8 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; ; 
VI-LABEL: truncate_high_elt_extract_vector: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[4:5], 0x0 ; VI-NEXT: s_load_dword s3, s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index efb1a630f927ca..931953e230bb2e 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -5,58 +5,58 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, <16 x i32> %in) { ; SI-LABEL: truncstore_arg_v16i32_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19 -; SI-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s18, 0xff -; SI-NEXT: s_lshl_b32 s0, s19, 24 -; SI-NEXT: s_lshl_b32 s1, s1, 16 -; SI-NEXT: s_or_b32 s0, s0, s1 -; SI-NEXT: s_lshl_b32 s1, s17, 8 -; SI-NEXT: s_and_b32 s2, s16, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_and_b32 s1, s1, 0xffff -; SI-NEXT: s_and_b32 s2, s14, 0xff -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_lshl_b32 s1, s15, 24 -; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_or_b32 s1, s1, s2 -; SI-NEXT: s_lshl_b32 s2, s13, 8 -; SI-NEXT: s_and_b32 s3, s12, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_and_b32 s3, s10, 0xff -; SI-NEXT: s_or_b32 s1, s2, s1 -; SI-NEXT: s_lshl_b32 s2, s11, 24 -; SI-NEXT: s_lshl_b32 s3, s3, 16 -; SI-NEXT: s_or_b32 s2, s2, s3 -; SI-NEXT: s_lshl_b32 s3, s9, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: 
s_and_b32 s16, s16, 0xff +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_or_b32 s3, s8, s3 -; SI-NEXT: s_and_b32 s3, s3, 0xffff ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_lshl_b32 s3, s7, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_or_b32 s3, s3, s6 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s3, s4, s3 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_mov_b32_e32 v3, s0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: truncstore_arg_v16i32_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s17 ; VI-NEXT: v_mov_b32_e32 v1, s16 @@ -98,9 +98,9 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, <16 x i64> %in) { ; SI-LABEL: truncstore_arg_v16i64_to_v16i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x39 -; SI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x29 +; SI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0x39 +; SI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x29 ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s38, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,9 +149,9 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; ; VI-LABEL: truncstore_arg_v16i64_to_v16i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0xe4 -; VI-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[16:31], s[0:1], 0xe4 +; VI-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0xa4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v0, 8, s26 ; VI-NEXT: v_mov_b32_e32 v1, s24 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 88bdf6454fe522..a9cd0e997e0e59 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: -; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[2:3], +; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] ; SI: buffer_store_dword [[VLOAD]] ; VI: flat_store_dword v[{{[0-9:]+}}], 
[[VLOAD]] diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 03a1b3598024b4..416dbb226422cc 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_uaddo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 @@ -46,14 +46,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_uaddo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_add_u32 s0, s6, s0 +; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_addc_u32 s1, s7, s1 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, 
ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 
+161,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel 
void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 
0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_uaddo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_uaddo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_uaddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_uaddo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_uaddo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_cmp_eq_u32 s2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -706,7 +706,7 @@ exit: define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -740,7 +740,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -767,7 +767,7 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index dfd9a650ff0e96..f686aad0cefc25 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 
@@ -44,7 +44,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; VI-LABEL: udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -80,7 +80,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -112,7 +112,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1030-LABEL: udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] @@ -185,7 +185,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -218,7 +218,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -282,7 +282,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -346,7 +346,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -401,7 +401,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -507,7 +507,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] @@ -619,7 +619,7 @@ define amdgpu_kernel void @udiv_v2i32(ptr 
addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s6, s10 @@ -714,7 +714,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: udiv_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s6, s10 @@ -809,7 +809,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 @@ -1098,7 +1098,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_pow2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: udiv_i32_div_pow2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1134,7 +1134,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1148,7 +1148,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; ; GFX1030-LABEL: udiv_i32_div_pow2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_even: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1203,7 +1203,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; VI-LABEL: udiv_i32_div_k_even: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1239,7 
+1239,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; ; GFX1030-LABEL: udiv_i32_div_k_even: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1277,7 +1277,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv_i32_div_k_odd: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1297,7 +1297,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; VI-LABEL: udiv_i32_div_k_odd: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1317,7 +1317,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1333,7 +1333,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; ; GFX1030-LABEL: udiv_i32_div_k_odd: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa define 
amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; VI-LABEL: v_udiv_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1452,7 +1452,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1030-LABEL: v_udiv_i8: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] @@ -1511,7 +1511,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1540,7 +1540,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1569,7 +1569,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i16: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3] @@ -1651,7 +1651,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i23: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1688,7 +1688,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i23: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1725,7 +1725,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1770,7 +1770,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; 
GFX1030-LABEL: v_udiv_i23: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -1885,7 +1885,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -1922,7 +1922,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1967,7 +1967,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; GFX1030-LABEL: v_udiv_i24: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x3 @@ -2048,7 +2048,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { ; SI-LABEL: scalarize_mulhu_4xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,7 +2076,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; VI-LABEL: scalarize_mulhu_4xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2104,7 +2104,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2130,7 +2130,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; ; GFX1030-LABEL: scalarize_mulhu_4xi32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] @@ -2193,7 +2193,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read define amdgpu_kernel void @test_udiv2(i32 %p) { ; SI-LABEL: test_udiv2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; VI-LABEL: test_udiv2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GCN-LABEL: test_udiv2: 
; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2227,7 +2227,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; ; GFX1030-LABEL: test_udiv2: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 @@ -2253,7 +2253,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; SI-LABEL: test_udiv_3_mulhu: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -2266,7 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; VI-LABEL: test_udiv_3_mulhu: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -2279,7 +2279,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GCN-LABEL: test_udiv_3_mulhu: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[6:7], 0x0 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -2290,7 +2290,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; ; GFX1030-LABEL: test_udiv_3_mulhu: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 78f85569f849d7..84906ac1f27ba9 100644 --- 
a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -123,8 +123,8 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -398,8 +398,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -497,17 +497,17 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { define 
amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 @@ -533,17 +533,17 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s8, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s4, s0 @@ -576,18 +576,18 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 
0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -614,18 +614,18 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -659,8 +659,8 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -684,8 +684,8 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -716,14 +716,12 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, 0xff000000 -; GCN-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NEXT: s_and_b32 s0, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s1, s3, 0xffff ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 @@ -734,18 +732,20 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_sub_u32 s8, 0, s0 ; GCN-NEXT: s_subb_u32 s9, 0, s1 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s2, -1 
+; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 @@ -831,20 +831,20 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s5, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 ; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff ; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[4:5], 24 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[0:1] +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 @@ -869,8 +869,8 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s14 -; GCN-IR-NEXT: s_add_u32 s14, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s1, -1 +; GCN-IR-NEXT: s_add_u32 s14, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; 
GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] ; GCN-IR-NEXT: s_add_u32 s8, s4, s16 ; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 @@ -888,7 +888,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 @@ -898,10 +898,10 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow4 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[0:1] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 @@ -920,7 +920,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1025,7 +1025,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: 
s_flbit_i32_b64 s12, s[2:3] @@ -1364,7 +1364,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_add_u32 s1, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 @@ -1443,7 +1443,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1661,7 +1661,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 8 @@ -1682,7 +1682,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 @@ -1709,7 +1709,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; 
GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1731,7 +1731,7 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_udiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 1468c7b99b5c25..f0f0b6680e0e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,22 +36,22 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x26 -; GFX6-NEXT: s_load_dword s9, s[2:3], 0x1d +; GFX6-NEXT: s_load_dword s8, s[0:1], 0x26 +; GFX6-NEXT: s_load_dword s9, s[0:1], 0x1d +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: s_mov_b32 s3, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, s6 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_readfirstlane_b32 s10, v0 @@ -69,34 +69,33 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) 
%out0, [8 x i32], ptr a ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_cselect_b32 s8, s10, s9 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x98 -; GFX8-NEXT: s_load_dword s5, s[2:3], 0x74 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s0, 0, s4 +; GFX8-NEXT: s_sub_i32 s2, 0, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 ; GFX8-NEXT: s_mul_i32 s0, s0, s4 ; GFX8-NEXT: s_sub_i32 s0, s5, s0 @@ -164,33 +163,33 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb +; GFX6-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s6 -; GFX6-NEXT: s_cmp_ge_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s4, s1, s0 -; GFX6-NEXT: s_sub_i32 s0, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s7 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 @@ -207,46 +206,44 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s5, s6, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: 
test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s0, 0, s6 +; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s6 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s6 -; GFX8-NEXT: s_cmp_ge_u32 s0, s6 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s7 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: s_mul_i32 s2, s2, s7 ; GFX8-NEXT: s_sub_i32 s2, s5, s2 @@ -335,36 +332,34 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: s_mul_i32 s0, s0, s8 -; GFX6-NEXT: s_sub_i32 s0, s4, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s8 -; GFX6-NEXT: s_cmp_ge_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -372,82 +367,87 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: v_readfirstlane_b32 s1, v0 -; GFX6-NEXT: s_mul_i32 s1, s1, s9 -; GFX6-NEXT: s_sub_i32 s1, s5, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, s1, s9 -; GFX6-NEXT: s_cmp_ge_u32 s1, s9 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: s_mul_i32 s4, s4, s10 -; GFX6-NEXT: s_sub_i32 s4, s6, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, s4, s10 -; GFX6-NEXT: s_cmp_ge_u32 s4, s10 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 -; GFX6-NEXT: s_sub_i32 s5, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 
+; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: s_mul_i32 s0, s0, s11 -; GFX6-NEXT: s_sub_i32 s0, s7, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s11 -; GFX6-NEXT: s_cmp_ge_u32 s0, s11 -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: s_sub_i32 s2, 0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s8 -; GFX8-NEXT: s_sub_i32 s0, s4, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s8 -; GFX8-NEXT: s_cmp_ge_u32 s0, s8 -; GFX8-NEXT: s_cselect_b32 s4, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: s_mul_i32 s2, s2, s8 +; GFX8-NEXT: s_sub_i32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, s2, s8 +; GFX8-NEXT: s_cmp_ge_u32 s2, s8 +; GFX8-NEXT: s_cselect_b32 s2, s3, s2 +; GFX8-NEXT: s_sub_i32 s3, 0, s9 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 @@ -455,44 +455,40 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s9 -; GFX8-NEXT: s_sub_i32 s0, s5, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s9 -; GFX8-NEXT: s_cmp_ge_u32 s0, s9 -; GFX8-NEXT: s_cselect_b32 s5, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: s_mul_i32 s3, 
s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s5, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, s3, s9 +; GFX8-NEXT: s_cmp_ge_u32 s3, s9 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s4, 0, s10 +; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: s_mul_i32 s0, s0, s10 -; GFX8-NEXT: s_sub_i32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 -; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s10 -; GFX8-NEXT: s_cmp_ge_u32 s0, s10 -; GFX8-NEXT: s_cselect_b32 s6, s1, s0 -; GFX8-NEXT: s_sub_i32 s0, 0, s11 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_mul_i32 s4, s4, s10 +; GFX8-NEXT: s_sub_i32 s4, s6, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, s10 +; GFX8-NEXT: s_cmp_ge_u32 s4, s10 +; GFX8-NEXT: s_cselect_b32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s5, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_readfirstlane_b32 s2, v3 ; GFX8-NEXT: s_mul_i32 s2, s2, s11 ; GFX8-NEXT: s_sub_i32 s2, s7, s2 diff --git 
a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index d00ea6dff24474..ba52d702c7ed11 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; ; VI-LABEL: v_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -66,7 +66,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -84,8 +84,8 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) { 
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -102,12 +102,12 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 ; VI-NEXT: v_ldexp_f64 v[4:5], v[2:3], 32 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -126,8 +126,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) { ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s9 @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x20 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s15 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], 
s13 @@ -194,8 +194,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_uint_to_fp_i32_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -205,8 +205,8 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; ; VI-LABEL: s_uint_to_fp_i32_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -221,7 +221,7 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -237,8 +237,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) { ; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -257,8 +257,8 @@ 
define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; ; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -284,8 +284,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -298,8 +298,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; ; VI-LABEL: uint_to_fp_i1_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -318,8 +318,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) { ; SI-LABEL: uint_to_fp_i1_to_f64_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -332,8 +332,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; ; VI-LABEL: 
uint_to_fp_i1_to_f64_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -351,8 +351,8 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) { ; SI-LABEL: s_uint_to_fp_i8_to_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -363,8 +363,8 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; ; VI-LABEL: s_uint_to_fp_i8_to_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -400,8 +400,8 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -414,8 +414,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -451,8 +451,8 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -465,8 +465,8 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; ; VI-LABEL: s_select_uint_to_fp_i1_vals_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 @@ -503,8 +503,8 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[6:7], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; SI-NEXT: s_load_dword s2, s[4:5], 0x2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 @@ -517,8 +517,8 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; ; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[6:7], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; VI-NEXT: s_load_dword s2, s[4:5], 0x8 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 3d0fc4e6281a6b..79b0a966bc1fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -75,7 +75,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 3, v0 @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, 
ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -126,27 +126,26 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -162,7 +161,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 { ; GFX6-LABEL: s_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -181,7 +180,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX8-LABEL: s_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 ; GFX8-NEXT: s_min_u32 s4, s4, 32 @@ -198,7 +197,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i ; ; GFX11-LABEL: s_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s4, s3 @@ -225,7 +224,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_i64_to_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 
3, v0 @@ -249,7 +248,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX8-LABEL: v_uint_to_fp_i64_to_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -274,26 +273,24 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad ; ; GFX11-LABEL: v_uint_to_fp_i64_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v3, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -309,8 +306,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -335,8 +332,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -362,8 +359,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -395,7 +392,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -446,7 +443,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -499,53 +496,51 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 +; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v8 -; GFX11-NEXT: v_ldexp_f32 v3, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v2, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v0, v5, v4 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v2, v1, v10 +; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 +; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 +; GFX11-NEXT: 
global_store_b128 v7, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -561,8 +556,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{ ; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -591,8 +586,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; ; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s2, s7 ; GFX8-NEXT: s_flbit_i32_b32 s3, s5 @@ -621,8 +616,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 ; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s7 @@ -659,7 +654,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 5, v0 @@ -718,7 +713,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -777,61 +772,59 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; ; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] offset:16 -; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_clz_i32_u32_e32 v9, v3 -; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4 +; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v11, v7 -; GFX11-NEXT: v_clz_i32_u32_e32 v12, v5 +; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8 +; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6 ; GFX11-NEXT: v_min_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_min_u32_e32 v10, 32, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX11-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[2:3], v9, v[2:3] -; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], v10, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4] +; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v11, v[6:7] -; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8] +; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9 ; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10 -; GFX11-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX11-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX11-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX11-NEXT: v_min_u32_e32 v1, 1, v1 +; GFX11-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX11-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11 -; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-NEXT: v_or_b32_e32 v3, v5, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 32, v12 -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v8, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12 ; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v8 -; GFX11-NEXT: v_ldexp_f32 v2, v2, v9 -; GFX11-NEXT: v_ldexp_f32 v0, v0, v10 -; GFX11-NEXT: v_ldexp_f32 v1, v1, v11 -; GFX11-NEXT: v_ldexp_f32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_ldexp_f32 v3, v3, v9 +; GFX11-NEXT: v_ldexp_f32 v1, v1, v10 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GFX11-NEXT: v_ldexp_f32 v2, v2, v11 +; GFX11-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v1, v0, v2 -; GFX11-NEXT: v_pack_b32_f16 v0, v3, v4 +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c21ae434f44709..5f8d0f665a953d 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -25,7 +25,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; VI-LABEL: uitofp_i16_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -43,7 +43,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; ; GFX11-LABEL: uitofp_i16_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -72,7 +72,7 @@ entry: define amdgpu_kernel void @uitofp_i32_to_f16( ; SI-LABEL: 
uitofp_i32_to_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -91,7 +91,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; VI-LABEL: uitofp_i32_to_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -110,7 +110,7 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; ; GFX11-LABEL: uitofp_i32_to_f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -143,7 +143,7 @@ entry: define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; SI-LABEL: uitofp_v2i16_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -168,7 +168,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; VI-LABEL: uitofp_v2i16_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -188,7 +188,7 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i16_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -221,7 +221,7 @@ entry: define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; SI-LABEL: uitofp_v2i32_to_v2f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], 
s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -244,7 +244,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; VI-LABEL: uitofp_v2i32_to_v2f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s10, s6 @@ -266,7 +266,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; ; GFX11-LABEL: uitofp_v2i32_to_v2f16: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s10, s6 @@ -301,21 +301,19 @@ entry: define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_uint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -323,26 +321,26 @@ define amdgpu_kernel 
void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,14 +348,16 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, s10 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index a3fc6ded0a0047..f60a274f1e592b 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -5,36 +5,36 @@ define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_cbranch_scc1 .LBB0_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB0_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s0, 0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB0_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB0_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm 
@@ -57,38 +57,38 @@ done: define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 +; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB1_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s1, 0 +; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], s3, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB1_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB1_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -111,36 +111,36 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_scc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword 
s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_cbranch_scc1 .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB2_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_scc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB2_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB2_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -163,38 +163,38 @@ done: define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_swap_br_targets_vcc: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s1, s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_load_dword s3, s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 ; SI-NEXT: s_and_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: 
s_mov_b32 s2, 1 ; SI-NEXT: .LBB3_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_swap_br_targets_vcc: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s1, s[2:3], 0x24 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_load_dword s3, s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s1, 0 +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], s3, 0 ; VI-NEXT: s_and_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB3_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB3_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -219,14 +219,14 @@ done: define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s0, v0 +; SI-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -237,14 +237,14 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) ; ; 
VI-LABEL: uniform_if_move_valu: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0 ; VI-NEXT: s_cbranch_vccnz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -271,14 +271,14 @@ endif: define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) { ; SI-LABEL: uniform_if_move_valu_commute: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_add_f32_e32 v0, s0, v0 +; SI-NEXT: v_add_f32_e32 v0, s2, v0 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; SI-NEXT: s_cbranch_vccnz .LBB5_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -289,14 +289,14 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f ; ; VI-LABEL: uniform_if_move_valu_commute: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-NEXT: v_add_f32_e32 v0, s2, v0 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0 ; VI-NEXT: s_cbranch_vccnz .LBB5_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -322,36 +322,38 @@ endif: define 
amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) { ; SI-LABEL: uniform_if_else_ret: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB6_2 ; SI-NEXT: ; %bb.1: ; %if.else +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_2: ; %if.then +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: uniform_if_else_ret: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB6_2 ; VI-NEXT: ; %bb.1: ; %if.else +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB6_2: ; %if.then +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -374,8 +376,8 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) { ; SI-LABEL: uniform_if_else: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[2:3], 0xd -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -401,8 +403,8 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr ; ; VI-LABEL: uniform_if_else: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[2:3], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -444,17 +446,17 @@ if.end: ; preds = %if.else, %if.then define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: icmp_2_users: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dword s4, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_gt_i32 s4, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 1 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %IF -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB8_2: ; %ENDIF @@ -462,17 +464,17 @@ define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) { ; ; VI-LABEL: icmp_2_users: ; VI: ; %bb.0: ; %main_body -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_gt_i32 s4, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_cmp_lt_i32 s4, 1 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %IF -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB8_2: ; %ENDIF @@ -493,20 +495,20 @@ ENDIF: ; preds = %IF, %main_body define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) { ; SI-LABEL: icmp_users_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 ; SI-NEXT: s_cbranch_scc1 .LBB9_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_cmp_gt_i32 s1, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: s_cmp_gt_i32 s3, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %bb9 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB9_3: ; %bb7 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -516,20 +518,20 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p ; ; VI-LABEL: icmp_users_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s0, 1 +; VI-NEXT: s_cmp_lt_i32 s2, 1 ; VI-NEXT: s_cbranch_scc1 .LBB9_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_cmp_gt_i32 s1, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[0:1] +; VI-NEXT: s_cmp_gt_i32 s3, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB9_3 ; VI-NEXT: .LBB9_2: ; %bb9 ; 
VI-NEXT: s_endpgm ; VI-NEXT: .LBB9_3: ; %bb7 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -558,7 +560,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: uniform_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: .LBB10_1: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -570,7 +572,7 @@ define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: uniform_loop: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: .LBB10_1: ; %loop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -598,11 +600,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; SI-LABEL: uniform_inside_divergent: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz .LBB11_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0 @@ -621,11 +623,11 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; VI-LABEL: uniform_inside_divergent: ; VI: ; %bb.0: ; %entry ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; VI-NEXT: s_cbranch_execz .LBB11_2 ; VI-NEXT: ; %bb.1: 
; %if -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0 @@ -660,14 +662,14 @@ endif: define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_inside_uniform: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB12_2 ; SI-NEXT: .LBB12_1: ; %endif ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB12_2: ; %if -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -683,14 +685,14 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 % ; ; VI-LABEL: divergent_inside_uniform: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB12_2 ; VI-NEXT: .LBB12_1: ; %endif ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB12_2: ; %if -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -724,9 +726,9 @@ endif: define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) { ; SI-LABEL: divergent_if_uniform_if: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_cbranch_execz 
.LBB13_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -735,8 +737,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: .LBB13_2: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_load_dword s0, s[2:3], 0xb +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_load_dword s0, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -752,9 +754,9 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; ; VI-LABEL: divergent_if_uniform_if: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; VI-NEXT: s_cbranch_execz .LBB13_2 ; VI-NEXT: ; %bb.1: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 @@ -763,8 +765,8 @@ define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: .LBB13_2: ; %endif -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc0 .LBB13_4 @@ -805,12 +807,12 @@ exit: define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) { ; SI-LABEL: cse_uniform_condition_different_blocks: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 ; SI-NEXT: s_cbranch_scc1 .LBB14_2 ; SI-NEXT: ; %bb.1: ; %bb2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -825,12 +827,12 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr ; ; VI-LABEL: cse_uniform_condition_different_blocks: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lt_i32 s0, 1 +; VI-NEXT: s_cmp_lt_i32 s2, 1 ; VI-NEXT: s_cbranch_scc1 .LBB14_2 ; VI-NEXT: ; %bb.1: ; %bb2 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -865,7 +867,7 @@ bb9: ; preds = %bb8, %bb4 define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_eq: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -884,7 +886,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_eq: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -919,7 +921,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_ne: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -938,7 +940,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou ; ; VI-LABEL: uniform_if_scc_i64_ne: ; VI: ; %bb.0: ; 
%entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -973,7 +975,7 @@ done: define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) { ; SI-LABEL: uniform_if_scc_i64_sgt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; SI-NEXT: s_mov_b32 s0, 0 @@ -992,7 +994,7 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o ; ; VI-LABEL: uniform_if_scc_i64_sgt: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0 ; VI-NEXT: s_mov_b32 s0, 0 @@ -1029,17 +1031,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB18_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1048,17 +1050,17 @@ define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, 
0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB18_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1083,17 +1085,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 -; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: s_cbranch_vccnz .LBB19_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s0, 1 +; SI-NEXT: s_mov_b32 s2, 1 ; SI-NEXT: .LBB19_2: ; %done -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1102,17 +1104,17 @@ define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_read_b64 v[0:1], v0 -; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; VI-NEXT: s_cbranch_vccnz .LBB19_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_mov_b32 s0, 1 +; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: .LBB19_2: ; %done -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index 18b2397bbd5a7e..0cb408676552e1 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s3, 0 @@ -55,7 +55,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX940-LABEL: test_insert_extract: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX940-NEXT: s_mov_b32 s2, 0 ; GFX940-NEXT: s_and_b64 vcc, exec, -1 ; GFX940-NEXT: s_mov_b32 s3, 0 @@ -103,7 +103,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1030-LABEL: test_insert_extract: ; GFX1030: ; %bb.0: ; %entry -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX1030-NEXT: s_mov_b32 s2, 0 ; GFX1030-NEXT: s_mov_b32 s3, 0 ; GFX1030-NEXT: s_mov_b32 s4, 0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX1100-LABEL: test_insert_extract: ; GFX1100: ; %bb.0: ; %entry -; GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1100-NEXT: s_mov_b32 s2, 0 ; GFX1100-NEXT: s_mov_b32 s3, 0 ; GFX1100-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 63105453174ebe..c0c84d46b7356b 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; 
GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -122,8 +122,8 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -413,18 +413,18 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem31_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: s_sub_i32 s0, 0, s8 +; GCN-NEXT: s_sub_i32 s2, 0, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s2, s3, 1 @@ -448,18 +448,18 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem31_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s0, s[2:3], 0xe +; GCN-IR-NEXT: s_load_dword 
s2, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s2, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v0 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 @@ -490,112 +490,110 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: s_lshr_b32 s4, s11, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-NEXT: s_lshr_b32 s5, s7, 1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; 
GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-NEXT: s_sub_i32 s2, 0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_mul_i32 s6, s6, s4 +; GCN-NEXT: s_sub_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; 
GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 +; GCN-IR-NEXT: s_lshr_b32 s4, s11, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s7, 1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; 
GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 +; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -607,8 +605,8 @@ define amdgpu_kernel 
void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem24_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_lshr_b32 s4, s4, 8 @@ -632,8 +630,8 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; ; GCN-IR-LABEL: s_test_urem24_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[2:3], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_lshr_b32 s4, s4, 8 @@ -664,112 +662,110 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, 0, s0 -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: s_lshr_b32 s4, s11, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-NEXT: s_lshr_b32 s1, 
s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-NEXT: s_lshr_b32 s5, s7, 9 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s5, s5, s0 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-NEXT: s_sub_i32 s5, s4, s0 -; GCN-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-NEXT: s_sub_i32 s4, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s2 +; GCN-NEXT: s_sub_i32 s3, s3, s6 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-NEXT: s_sub_i32 s6, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-NEXT: s_sub_i32 s2, 0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: s_mul_i32 s0, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s8, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s1 -; GCN-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_mul_i32 s6, s6, s4 +; GCN-NEXT: s_sub_i32 s5, s5, s6 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-NEXT: s_sub_i32 s6, s5, s4 +; GCN-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0xd +; GCN-IR-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s8, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_sub_i32 s3, 0, s2 +; GCN-IR-NEXT: s_lshr_b32 s4, s11, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-IR-NEXT: s_lshr_b32 s3, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s5, s7, 9 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-IR-NEXT: s_mul_i32 s5, s5, s0 -; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 -; 
GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s4, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s5, s4, s0 -; GCN-IR-NEXT: s_cmp_ge_u32 s4, s0 -; GCN-IR-NEXT: s_cselect_b32 s0, s5, s4 -; GCN-IR-NEXT: s_sub_i32 s4, 0, s1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v1 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s2 +; GCN-IR-NEXT: s_sub_i32 s3, s3, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s3, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s6, s3, s2 +; GCN-IR-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-IR-NEXT: s_cselect_b32 s6, s6, s3 +; GCN-IR-NEXT: s_sub_i32 s2, 0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v1 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s5, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-IR-NEXT: s_mul_i32 s0, s0, s1 -; GCN-IR-NEXT: s_sub_i32 s0, s8, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: s_sub_i32 s2, s0, s1 -; GCN-IR-NEXT: s_cmp_ge_u32 s0, s1 -; GCN-IR-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_mul_i32 s6, s6, s4 +; GCN-IR-NEXT: s_sub_i32 s5, s5, s6 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s5, s6, s5 +; GCN-IR-NEXT: s_sub_i32 s6, s5, s4 +; GCN-IR-NEXT: s_cmp_ge_u32 s5, s4 +; GCN-IR-NEXT: s_cselect_b32 s4, s6, s5 +; GCN-IR-NEXT: 
v_mov_b32_e32 v2, s4 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -781,7 +777,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -885,7 +881,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] @@ -965,6 +961,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_den_i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_add_u32 s0, 0, 0xaaaa0000 ; GCN-NEXT: v_not_b32_e32 v0, 23 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -980,7 +977,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v1, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 ; GCN-NEXT: s_mul_i32 s8, s1, s8 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_hi_u32 v4, s1, v0 @@ -1003,8 +1000,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v0, 
s7, v0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,7 +1010,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s7 @@ -1042,7 +1038,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 @@ -1393,7 +1389,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s2, -1 @@ -1416,7 +1412,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s2, -1 @@ -1445,7 +1441,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; 
%bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -1470,7 +1466,7 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index dacc9862059831..666ae7c126ae3e 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; VI-LABEL: s_usubo_i64_zext: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 @@ -47,14 +47,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; ; GFX9-LABEL: s_usubo_i64_zext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 +; GFX9-NEXT: s_sub_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_subb_u32 s1, s7, s1 +; GFX9-NEXT: s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 @@ -95,8 +95,8 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -111,12 +111,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; 
GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: global_store_byte v0, v2, s[6:7] @@ -132,7 +132,7 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -161,7 +161,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -182,7 +182,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -210,7 +210,7 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i32_novcc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -243,7 +243,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_i32_novcc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -268,7 +268,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_i32_novcc: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] @@ -301,7 +301,7 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -325,7 +325,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: s_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 @@ -345,7 +345,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s6, s4, s6 @@ -370,7 +370,7 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: 
s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -401,7 +401,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -424,7 +424,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -486,7 +486,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; VI-LABEL: v_usubo_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -508,7 +508,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX9-LABEL: v_usubo_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] @@ -537,7 +537,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, 
ptr addrspace(1) % define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_usubo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 @@ -568,7 +568,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; VI-LABEL: v_usubo_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -591,7 +591,7 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; ; GFX9-LABEL: v_usubo_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] @@ -618,45 +618,45 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { ; SI-LABEL: s_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: s_cmp_eq_u32 s0, s1 -; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; 
SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cmp_eq_u32 s0, s1 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; VI-NEXT: .LBB8_2: ; %exit -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -668,19 +668,19 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: s_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_cmp_eq_u32 s2, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: .LBB8_2: ; %exit -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: global_store_byte v1, v2, s[6:7] @@ -707,7 +707,7 @@ exit: define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s14, s2 @@ -741,7 +741,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 @@ -768,7 +768,7 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll index 2210b6c0d3c3a4..ca4d689156b491 100644 --- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_add_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll index 4b9b5f9ffdf84f..2fa9750653b6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll @@ -23,7 +23,7 @@ entry: define amdgpu_kernel void @fcmp_test(half %x, half %y) { ; CHECK-LABEL: fcmp_test: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 @@ -46,7 +46,7 @@ entry: define amdgpu_kernel void @ballot_test(half %x, half %y) { ; CHECK-LABEL: ballot_test: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_b32 s0, s[2:3], 0x0 +; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s1, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index a8f3635416cffa..fc6df735c05b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -13,37 +13,37 @@ declare double @llvm.fabs.f64(double) define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: 
v_cnd_nan_nosgpr: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_eq_u32 s8, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cnd_nan_nosgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc @@ -54,37 +54,35 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; ; GFX10-LABEL: v_cnd_nan_nosgpr: ; GFX10: 
; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan_nosgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc @@ -109,7 +107,7 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { ; SI-LABEL: v_cnd_nan: ; 
SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -124,7 +122,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -137,7 +135,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 @@ -148,7 +146,7 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 @@ -171,30 +169,30 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 
+; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -202,27 +200,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -238,30 +235,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -269,25 +266,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -305,30 +301,30 @@ 
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x4c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -336,27 +332,26 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x4c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s0, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[2:3] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1] ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x4c -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -372,30 +367,30 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; 
SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -403,25 +398,24 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: 
s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 @@ -439,16 +433,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -457,20 +451,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -478,34 +472,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -526,16 +518,16 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 -; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -544,20 +536,20 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; VI-NEXT: s_load_dword s2, s[2:3], 0x2c +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 +; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -565,34 +557,32 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 +; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] @@ -613,8 +603,8 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -632,8 +622,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dword s0, s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -652,10 +642,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX10-NEXT: s_load_dword s0, s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -666,12 +655,9 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o ; ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -695,8 +681,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -716,8 +702,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -741,13 +727,13 @@ 
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc @@ -757,10 +743,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -789,8 +773,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -810,8 +794,8 @@ 
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -835,13 +819,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc @@ -851,10 +835,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -883,8 +865,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o define amdgpu_kernel void 
@icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -905,8 +887,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -931,13 +913,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc @@ -948,10 +930,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc @@ -981,8 +961,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1007,8 +987,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1036,14 +1016,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] 
glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1056,10 +1036,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1092,8 +1070,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1118,8 +1096,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1147,14 +1125,14 @@ define 
amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1167,10 +1145,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1205,8 +1181,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 @@ -1231,8 +1207,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1260,14 +1236,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc @@ -1280,10 +1256,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1316,8 +1290,8 @@ define 
amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 @@ -1341,8 +1315,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1369,13 +1343,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 @@ -1388,10 +1362,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc @@ -1426,8 +1398,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1451,8 +1423,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1479,14 +1451,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc @@ -1497,10 +1469,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1532,8 +1502,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 @@ -1556,8 +1526,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,14 +1553,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -1601,10 +1571,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,8 +1604,8 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; 
SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1657,8 +1625,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1682,13 +1650,13 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc @@ -1698,10 +1666,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1731,8 
+1697,8 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -1756,8 +1722,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1785,13 +1751,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc @@ -1805,10 +1771,8 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr 
add ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc @@ -1844,18 +1808,18 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 @@ -1863,22 +1827,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f16: ; VI: ; %bb.0: -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1891,15 +1855,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1907,23 +1871,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX10-NEXT: global_store_short v2, v0, s[0:1] +; GFX10-NEXT: global_store_short v2, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
v_cndmask_abs_neg_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 @@ -1948,37 +1910,37 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dword s8, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] @@ -1989,37 +1951,35 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[0:1] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] @@ -2041,18 +2001,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { ; SI-LABEL: v_cndmask_abs_neg_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dword s8, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9 -; SI-NEXT: s_mov_b32 
s10, -1 -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 @@ -2060,22 +2020,22 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cndmask_abs_neg_f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 @@ -2089,15 +2049,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; ; GFX10-LABEL: v_cndmask_abs_neg_f64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
s_load_dword s4, s[2:3], 0x2c +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 @@ -2106,23 +2066,21 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cndmask_abs_neg_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 472a443cf6dde7..f7933d719f9893 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -6,57 +6,57 @@ define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_ushort v1, off, 
s[0:3], 0 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 -; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ; GFX11-NEXT: s_mov_b32 s14, s10 @@ -95,32 +95,32 @@ entry: define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s14 -; SI-NEXT: s_mov_b32 s11, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -128,49 +128,49 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: v_mac_f32_e32 v3, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 -; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 -; VI-NEXT: s_mov_b32 s15, 0xf000 -; VI-NEXT: s_mov_b32 s14, -1 -; VI-NEXT: s_mov_b32 s18, s14 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s19, s15 +; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s14 -; VI-NEXT: s_mov_b32 s11, s15 -; VI-NEXT: s_mov_b32 s2, s14 -; VI-NEXT: s_mov_b32 s3, s15 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 glc +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v3, 0x4900 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: v_mac_f16_e32 v3, v0, v2 -; VI-NEXT: buffer_store_short v1, off, s[12:15], 0 -; VI-NEXT: buffer_store_short v3, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v3, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: madak_f16_use_2: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 ; GFX11-NEXT: s_mov_b32 s14, -1 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000 ; GFX11-NEXT: s_mov_b32 s18, s14 diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 9f6d27802e1843..8bc8fbd0e0e846 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -24,7 +24,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; ; GISEL-LABEL: v_pack_b32_v2f16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -56,7 +56,7 @@ 
define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32_v2f16_sub: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -73,7 +73,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; ; GISEL-LABEL: v_pack_b32_v2f16_sub: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -105,7 +105,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs define amdgpu_kernel void @fptrunc( ; GCN-LABEL: fptrunc: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s7, 0x31016000 ; GCN-NEXT: s_mov_b32 s10, s6 @@ -125,7 +125,7 @@ define amdgpu_kernel void @fptrunc( ; ; GISEL-LABEL: fptrunc: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -147,7 +147,7 @@ define amdgpu_kernel void @fptrunc( define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fabs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -164,7 +164,7 @@ define amdgpu_kernel 
void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fabs: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 { ; GCN-LABEL: v_pack_b32.fneg: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc @@ -215,7 +215,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; ; GISEL-LABEL: v_pack_b32.fneg: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 8579cbdf47137d..89fef7eead839a 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,7 +89,7 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 @@ -104,7 +104,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr 
addrspace(1) %out, i32 inreg ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -117,7 +117,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff @@ -132,7 +132,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -156,7 +156,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -176,7 +176,7 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -413,13 +413,13 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { define 
amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) { ; SDAG-VI-LABEL: vec_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s2, s4, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s2, 0 +; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 @@ -430,24 +430,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; SDAG-GFX9-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; SDAG-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; SDAG-GFX9-NEXT: s_movk_i32 s2, 0xff +; SDAG-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; SDAG-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_movk_i32 s0, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, s4, 0 -; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s2 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s0 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: vec_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_clause 0x1 -; SDAG-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; SDAG-GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0 +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0 ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -457,24 +457,24 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_lshr_b32 s3, s4, 16 +; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_max_i32 s4, s4, s2 -; GISEL-VI-NEXT: s_max_i32 s2, s3, s2 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-VI-NEXT: s_max_i32 s2, s2, s3 +; GISEL-VI-NEXT: s_max_i32 s3, s4, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_min_i32 s3, s3, s4 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_min_i32 s2, s2, s4 ; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_lshl_b32 s2, s2, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s3, s2 +; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 +; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 +; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -483,40 +483,40 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; ; GISEL-GFX9-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; 
GISEL-GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c -; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s4 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s4 ; GISEL-GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GISEL-GFX9-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX9-NEXT: s_max_i32 s3, s4, 0 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GISEL-GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX9-NEXT: s_max_i32 s0, s1, s0 +; GISEL-GFX9-NEXT: s_max_i32 s1, s4, 0 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GISEL-GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0xff00ff -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, 0xff -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_min_i32 s1, s1, s4 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, 0xff +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: vec_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_clause 0x1 -; GISEL-GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, 0 +; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s4 -; GISEL-GFX11-NEXT: s_ashr_i32 
s4, s4, 16 -; GISEL-GFX11-NEXT: s_max_i32 s2, s3, s2 -; GISEL-GFX11-NEXT: s_max_i32 s3, s4, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff ; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2 ; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll index 02a6024f858e9f..d5347f829002db 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll @@ -25,7 +25,7 @@ bb: define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 { ; GFX9-LABEL: test_sub_co_sdwa: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll index f0cbeba1cfb743..eb88c790dfe729 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -10,8 +10,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: 
s_mov_b32 s6, 0 @@ -34,27 +34,30 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %ou define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx0, i32 %idx1) #1 { ; GCN-LABEL: extract_insert_different_dynelt_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: s_load_dword s14, s[0:1], 0xf ; GCN-NEXT: s_cmp_eq_u32 s13, 3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 2 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 1 -; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s13, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s14, 1 ; GCN-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] @@ -84,8 +87,8 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1 define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_elt2_v4i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 
-; GCN-NEXT: s_load_dword s0, s[2:3], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 @@ -108,19 +111,19 @@ define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, float %val, i32 %idx) #1 { ; GCN-LABEL: extract_insert_same_dynelt_v4f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; GCN-NEXT: s_load_dword s0, s[2:3], 0xd -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[8:11], 0 addr64 glc +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 2797c5b7988810..66c49ba8b734db 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1585,47 +1585,45 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, 
ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
fma_shuffle_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 @@ -1715,7 +1713,7 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) % define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) { ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1729,7 +1727,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 @@ -1743,7 +1741,7 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; ; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -4239,8 +4237,8 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: 
; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0x7060302 @@ -4323,8 +4321,8 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 @@ -4406,14 +4404,12 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -4424,43 +4420,43 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 ; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 ; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 ; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v9 -; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo +; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4495,7 +4491,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 02da6deb96f1fe..340f0cdd5d5d07 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -462,9 +462,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-LABEL: name: livevariables_update_missed_block ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -474,7 +474,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %54, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], 
%subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -502,14 +502,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %55:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %49:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 @@ -562,9 +562,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-LABEL: name: nested_waterfalls ; SI: bb.0.entry: ; SI-NEXT: successors: %bb.1(0x80000000) - ; SI-NEXT: liveins: $vgpr0, $sgpr2_sgpr3 + ; SI-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; SI-NEXT: {{ $}} - ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr2_sgpr3 + ; SI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr0_sgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.1.if.then: @@ -635,7 +635,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: bb.5: ; SI-NEXT: successors: 
%bb.4(0x40000000), %bb.6(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %28:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; SI-NEXT: [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %22:vreg_64, [[REG_SEQUENCE5]], killed [[REG_SEQUENCE8]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc ; SI-NEXT: SI_WATERFALL_LOOP %bb.4, implicit $exec ; SI-NEXT: {{ $}} @@ -648,7 +648,7 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe ; SI-NEXT: {{ $}} ; SI-NEXT: bb.7: ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI-NEXT: GLOBAL_STORE_DWORD undef %31:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; SI-NEXT: GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; SI-NEXT: S_ENDPGM 0 entry: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index c6eff2c5aad798..31000a7e12b57b 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -4,8 +4,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 8 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7] @@ -28,9 +28,9 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[0:1] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[2:3] offset:2 +; GFX906-NEXT: global_store_short v1, v4, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -52,21 +52,21 @@ bb.2: define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dword v2, v3, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; 
%bb.1 ; GFX906-NEXT: global_load_dword v2, v3, s[6:7] ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[0:1] +; GFX906-NEXT: global_store_dword v1, v2, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,8 +88,8 @@ bb.2: define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -97,16 +97,16 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: global_store_byte v3, v2, s[0:1] offset:4 -; GFX906-NEXT: global_store_dword v3, v1, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: global_store_byte v3, v2, s[2:3] offset:4 +; GFX906-NEXT: global_store_dword v3, v1, s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -128,21 +128,21 @@ bb.2: define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture 
%dst) { ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[6:7] ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -164,21 +164,21 @@ bb.2: define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[6:7] ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 
exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -200,25 +200,25 @@ bb.2: define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[4:5] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[6:7] offset:16 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[6:7] ; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16 +; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1] +; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -240,25 +240,25 @@ bb.2: define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v256i8_liveout: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; 
GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:240 -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v4, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[4:5] offset:224 ; GFX906-NEXT: s_nop 0 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[4:5] offset:208 @@ -280,11 +280,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] offset:240 ; GFX906-NEXT: s_waitcnt 
vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; 4-byte Folded Spill ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[6:7] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[6:7] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[6:7] offset:192 @@ -302,7 +302,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[6:7] ; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_waitcnt vmcnt(7) +; GFX906-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:96 @@ -318,11 +318,11 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(7) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded 
Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240 ; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] offset:224 @@ -353,9 +353,9 @@ bb.2: define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: repeat_successor: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s8, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c +; GFX906-NEXT: s_load_dword s8, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_cmp_lt_i32 s8, 3 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 @@ -375,7 +375,7 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr ; GFX906-NEXT: .LBB7_5: ; %return.sink.split ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] +; GFX906-NEXT: global_store_dword v1, v0, s[2:3] ; GFX906-NEXT: .LBB7_6: ; %return ; GFX906-NEXT: s_endpgm entry: @@ -405,7 +405,7 @@ return: define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt 
lgkmcnt(0) @@ -460,13 +460,13 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_zeroinit: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 ; GFX906-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX906-NEXT: s_cbranch_execz .LBB9_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 @@ -522,7 +522,7 @@ bb.3: define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_phi_const: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: ; implicit-def: $vgpr3 @@ -631,7 +631,7 @@ bb.3: define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { ; GFX906-LABEL: v8i8_multi_block: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -682,25 +682,25 @@ bb.3: define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { ; GFX906-LABEL: v32i8_loop_carried: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 ; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 ; GFX906-NEXT: s_mov_b32 s4, 0x2000604 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v1, s[0:1] -; GFX906-NEXT: s_mov_b64 s[0:1], 0 +; GFX906-NEXT: global_load_dword v1, v1, s[2:3] +; GFX906-NEXT: s_mov_b64 s[2:3], 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, v1 ; GFX906-NEXT: .LBB12_1: ; %bb.1 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX906-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX906-NEXT: s_cbranch_execnz .LBB12_1 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: global_store_dword v1, v0, s[0:1] @@ -728,9 +728,9 @@ bb.2: define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { ; GFX906-LABEL: v8i8_multiuse_multiblock: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX906-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) @@ -866,5 +866,4 @@ bb.3: ret void } - declare i32 
@llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll index d7db68a433319c..1afe5cdea87233 100644 --- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll @@ -18,29 +18,29 @@ declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 i define amdgpu_kernel void @foo(i1 %cmp1) { ; GFX906-LABEL: foo: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s9 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 -; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:4 -; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:8 -; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:12 -; GFX906-NEXT: s_load_dword s4, s[2:3], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x1c -; GFX906-NEXT: s_mov_b64 s[2:3], exec +; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX906-NEXT: s_mov_b32 s10, -1 +; GFX906-NEXT: s_mov_b32 s11, 0xe00000 +; GFX906-NEXT: s_add_u32 s8, s8, s3 +; GFX906-NEXT: s_addc_u32 s9, s9, 0 +; GFX906-NEXT: buffer_load_dword v3, off, s[8:11], 0 +; GFX906-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:4 +; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:8 +; GFX906-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:12 +; GFX906-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1c ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_bitcmp1_b32 s4, 0 -; GFX906-NEXT: s_mul_i32 s0, s0, s1 -; GFX906-NEXT: v_mul_u32_u24_e32 v1, s1, v1 +; GFX906-NEXT: s_mul_i32 s0, s2, s3 +; GFX906-NEXT: v_mul_u32_u24_e32 v1, s3, v1 ; GFX906-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; 
GFX906-NEXT: v_add_lshl_u32 v2, v0, v2, 4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_mov_b32 s4, 0 ; GFX906-NEXT: v_mov_b32_e32 v1, v0 ; GFX906-NEXT: s_cselect_b32 s5, 1, 0 +; GFX906-NEXT: s_mov_b64 s[2:3], exec ; GFX906-NEXT: ds_write_b64 v2, v[0:1] ; GFX906-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 ; GFX906-NEXT: s_waitcnt vmcnt(3) diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 5b0b80c56b7b5d..47923fddae3828 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; 
GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] @@ -101,7 +101,7 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) { define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_2xf16: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vopc_2xf16: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -138,25 +138,25 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 { ; GFX1032-LABEL: test_vopc_class: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_class_f32_e64 s2, s4, 0x204 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vopc_class: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], s4, 0x204 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 0x204 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 @@ -169,27 +169,27 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 ; GFX1032-LABEL: test_vcmp_vcnd_f16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s4 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo -; GFX1032-NEXT: global_store_short v1, v0, s[0:1] +; GFX1032-NEXT: global_store_short v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vcmp_vcnd_f16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_cmp_neq_f16_e64 vcc, 0x7c00, s4 ; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v0, vcc -; GFX1064-NEXT: global_store_short v1, v0, s[0:1] +; GFX1064-NEXT: global_store_short v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %cmp = fcmp oeq half %x, 0x7FF0000000000000 %sel = select i1 %cmp, half 1.0, half %x @@ -200,7 +200,7 @@ define amdgpu_kernel void 
@test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -214,7 +214,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -239,7 +239,7 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -278,7 +278,7 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] @@ -292,7 +292,7 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] @@ -318,10 +318,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_mask_if: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 10, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: ; %if -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dword v0, v0, s[0:1] @@ -331,10 +331,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 { ; GFX1064-LABEL: test_mask_if: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 10, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: ; %if -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dword v0, v0, s[0:1] @@ -355,7 +355,7 @@ endif: define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_loop_with_if: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 ; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -516,42 +516,42 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s2, s2, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s2, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: 
Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 -; GFX1032-NEXT: s_or_b32 s4, s5, s4 -; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 s2, s5, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: .LBB11_4: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr2 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -561,10 +561,10 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1064: ; %bb.0: ; %bb ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_6 ; GFX1064-NEXT: ; %bb.1: ; %.preheader -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0 @@ -631,7 +631,7 @@ bb8: define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_addc_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -643,7 +643,7 @@ define 
amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_addc_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -664,7 +664,7 @@ bb: define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subbrev_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -676,7 +676,7 @@ define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) ; ; GFX1064-LABEL: test_subbrev_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -697,7 +697,7 @@ bb: define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 { ; GFX1032-LABEL: test_subb_vop2b: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] @@ -709,7 +709,7 @@ define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 ; ; GFX1064-LABEL: test_subb_vop2b: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx2 v[0:1], 
v2, s[0:1] @@ -730,7 +730,7 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1063,7 +1063,7 @@ bb: define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1077,7 +1077,7 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa ; ; GFX1064-LABEL: test_div_scale_f32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc @@ -1104,33 +1104,31 @@ define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_div_scale_f64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: 
v_lshlrev_b32_e32 v4, 3, v0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_scale_f64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1064-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -1188,8 +1186,8 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-LABEL: 
test_div_fmas_f32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s5 @@ -1197,14 +1195,14 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1032-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_div_fmas_f32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s5 @@ -1212,7 +1210,7 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, fl ; GFX1064-NEXT: s_bitcmp1_b32 s7, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f32 v0, s4, v0, v1 -; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v2, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -1223,14 +1221,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1032-LABEL: test_div_fmas_f64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1032-NEXT: s_load_dwordx8 s[4:11], 
s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: v_mov_b32_e32 v2, s10 ; GFX1032-NEXT: v_mov_b32_e32 v3, s11 -; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 @@ -1240,14 +1238,14 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX1064-LABEL: test_div_fmas_f64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x44 +; GFX1064-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x44 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 ; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: v_mov_b32_e32 v2, s10 ; GFX1064-NEXT: v_mov_b32_e32 v3, s11 -; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s2, 0 ; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 @@ -1263,9 +1261,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34 +; GFX1032-NEXT: s_mov_b32 null, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) @@ -1290,9 +1290,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX1064-NEXT: s_mov_b32 null, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1343,7 +1344,7 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 @@ -1362,7 +1363,7 @@ define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) # ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 @@ -1388,13 +1389,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1032-LABEL: test_br_cc_f16: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1032-NEXT: global_load_ushort v2, v0, s[0:1] +; 
GFX1032-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 ; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1408,13 +1409,13 @@ define amdgpu_kernel void @test_br_cc_f16( ; GFX1064-LABEL: test_br_cc_f16: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: global_load_ushort v1, v0, s[6:7] -; GFX1064-NEXT: global_load_ushort v2, v0, s[0:1] +; GFX1064-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 ; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 @@ -1445,12 +1446,12 @@ two: define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { ; GCN-LABEL: test_brcc_i1: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x34 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp0_b32 s0, 0 +; GCN-NEXT: s_bitcmp0_b32 s2, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB25_2 ; GCN-NEXT: ; %bb.1: ; %store -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xde ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -1472,14 +1473,14 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dword s1, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 -; GFX1032-NEXT: s_or_b32 s1, s2, s1 -; GFX1032-NEXT: s_or_b32 s0, s1, s0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s2, 1.0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s3, 1.0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s2, s2, 0 +; GFX1032-NEXT: s_or_b32 s0, s0, s1 +; GFX1032-NEXT: s_or_b32 s0, s0, s2 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 @@ -1492,11 +1493,11 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dword s5, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s5, 1.0 +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 ; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s4, 0 ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -1530,7 +1531,7 @@ bb2: define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1032-NEXT: ; implicit-def: $sgpr1 ; GFX1032-NEXT: ; implicit-def: $sgpr2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1568,7 +1569,7 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; ; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 ; 
GFX1064-NEXT: ; implicit-def: $sgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1633,7 +1634,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -1648,7 +1649,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) ; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1671,29 +1672,29 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 ; GFX1032-LABEL: test_set_inactive: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v0, 42 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_set_inactive: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, 
s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v0, 42 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: global_store_dword v1, v0, s[2:3] ; GFX1064-NEXT: s_endpgm %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) store i32 %tmp, ptr addrspace(1) %out @@ -1703,7 +1704,7 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { ; GFX1032-LABEL: test_set_inactive_64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mov_b32_e32 v0, s2 @@ -1717,7 +1718,7 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; ; GFX1064-LABEL: test_set_inactive_64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mov_b32_e32 v0, s2 @@ -2137,7 +2138,7 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2147,7 +2148,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2165,26 +2166,26 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i64: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i64: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX1064-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, ptr addrspace(1) %out @@ -2194,7 +2195,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; 
GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| @@ -2204,7 +2205,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| @@ -2221,25 +2222,25 @@ define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { ; GFX1032-LABEL: test_intr_icmp_i32: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s4 -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_icmp_i32: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s4 -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4 +; 
GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] ; GFX1064-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, ptr addrspace(1) %out @@ -2353,7 +2354,7 @@ define amdgpu_ps float @test_ps_live() #0 { define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2373,7 +2374,7 @@ define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, pt ; ; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2470,7 +2471,7 @@ main_body: define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2504,7 +2505,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2565,7 +2566,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-LABEL: fcmp64: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 
0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2597,7 +2598,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; ; GFX1064-LABEL: fcmp64: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 @@ -2657,7 +2658,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-LABEL: icmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 @@ -2691,7 +2692,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; ; GFX1064-LABEL: icmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 @@ -2751,7 +2752,7 @@ if.end2: ; preds = %if.end define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-LABEL: fcmp32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x28 +; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x28 ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 @@ -2783,7 +2784,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; ; GFX1064-LABEL: fcmp32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dword s2, s[2:3], 0x28 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x28 ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 diff --git 
a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 978ac548443f73..e0b320aa4f3727 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,7 +6,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -22,7 +22,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -36,7 +36,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -76,7 +76,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_zext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: 
v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -91,7 +91,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_zext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -134,7 +134,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; VI-LABEL: widen_i16_constant_load_sext_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -149,7 +149,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar ; ; GFX11-LABEL: widen_i16_constant_load_sext_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -175,7 +175,7 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ 
-199,7 +199,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i17_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 2 @@ -218,7 +218,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i17_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_f16_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -263,7 +263,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_f16_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -275,7 +275,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_f16_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -296,7 +296,7 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: 
widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -317,7 +317,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_v2i8_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 44 ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -338,7 +338,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_v2i8_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -368,7 +368,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) { ; SI-LABEL: no_widen_i16_constant_divergent_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -387,7 +387,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -404,9 +404,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; ; GFX11-LABEL: no_widen_i16_constant_divergent_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] @@ -433,7 +431,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -448,7 +446,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; VI-LABEL: widen_i1_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -461,7 +459,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { ; ; GFX11-LABEL: widen_i1_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -482,7 +480,7 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) { define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -499,7 +497,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -514,7 +512,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) ; ; GFX11-LABEL: widen_i16_zextload_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -540,7 +538,7 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -558,7 +556,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -574,7 +572,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; ; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -598,7 +596,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; SI-LABEL: widen_i16_constant32_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[2:3], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0x9 ; 
SI-NEXT: s_mov_b32 s1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 @@ -614,7 +612,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; VI-LABEL: widen_i16_constant32_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[2:3], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -629,7 +627,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { ; ; GFX11-LABEL: widen_i16_constant32_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -653,7 +651,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) { define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) { ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -669,7 +667,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; VI-LABEL: widen_i16_global_invariant_load: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -683,7 +681,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg ; ; GFX11-LABEL: widen_i16_global_invariant_load: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll 
b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index d31c9e7e03e793..40e4692a18ec79 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX9-LABEL: workgroup_id_x: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -17,7 +17,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX12-LABEL: workgroup_id_x: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -33,26 +33,24 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: global_store_dword v1, v2, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX12-NEXT: s_and_b32 s4, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NEXT: global_store_b32 v1, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -67,14 +65,14 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { ; GFX9-LABEL: workgroup_id_xyz: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -84,8 +82,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x10 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff ; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 638574f359fa0e..26030d873aec9e 100644 
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -415,49 +415,33 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; ; GFX9-O3-LABEL: call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_getpc_b64 s[8:9] +; GFX9-O3-NEXT: s_add_u32 s8, s8, called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s9, s9, called@rel32@hi+12 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; 
GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: s_getpc_b64 s[22:23] -; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -664,57 +648,42 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; ; GFX9-O3-LABEL: call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: 
s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s3, s3, called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: 
v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1275,49 +1244,33 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; ; GFX9-O3-LABEL: strict_wwm_call: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; 
GFX9-O3-NEXT: s_getpc_b64 s[8:9] +; GFX9-O3-NEXT: s_add_u32 s8, s8, strict_wwm_called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s9, s9, strict_wwm_called@rel32@hi+12 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: s_getpc_b64 s[22:23] -; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm @@ -1524,57 +1477,42 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; ; GFX9-O3-LABEL: 
strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: -; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 -; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 -; GFX9-O3-NEXT: s_mov_b32 s26, -1 -; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 -; GFX9-O3-NEXT: s_add_u32 s24, s24, s9 +; GFX9-O3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX9-O3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX9-O3-NEXT: s_mov_b32 s14, -1 +; GFX9-O3-NEXT: s_mov_b32 s15, 0xe00000 +; GFX9-O3-NEXT: s_add_u32 s12, s12, s3 +; GFX9-O3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-O3-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX9-O3-NEXT: s_mov_b32 s32, 0 -; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX9-O3-NEXT: s_mov_b32 s14, s8 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 +; GFX9-O3-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[4:5] +; GFX9-O3-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 -; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 -; GFX9-O3-NEXT: s_getpc_b64 s[2:3] -; GFX9-O3-NEXT: s_add_u32 s2, s2, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: 
s_addc_u32 s3, s3, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 -; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 -; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: s_mov_b32 s12, s6 -; GFX9-O3-NEXT: s_mov_b32 s13, s7 -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[12:13] +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[14:15] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 offset:4 ; GFX9-O3-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 9fac17f33d0d36..e15fd7f29671a4 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -5,31 +5,31 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -54,33 +54,33 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], 
s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v3, v7, v3 ; SI-NEXT: v_xor_b32_e32 v2, v6, v2 ; SI-NEXT: v_xor_b32_e32 v1, v5, v1 ; SI-NEXT: v_xor_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: xor_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -107,8 +107,8 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 @@ -133,8 +133,8 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; ; VI-LABEL: xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -165,32 +165,32 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: v_xor_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 glc +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 1, v0 -; SI-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_xor_i1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -216,30 +216,30 @@ define 
amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0 define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -263,7 +263,7 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-LABEL: scalar_xor_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -276,7 +276,7 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -292,8 +292,8 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; SI-LABEL: scalar_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[2:3], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -304,10 +304,10 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { ; ; VI-LABEL: scalar_not_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[2:3], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_not_b32 s2, s4 +; VI-NEXT: s_not_b32 s2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -321,7 +321,7 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) { define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -339,7 +339,7 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i32: ; VI: ; %bb.0: -; 
VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -360,31 +360,31 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 ; SI-NEXT: v_xor_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: vector_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 @@ -409,8 +409,8 @@ define amdgpu_kernel void 
@vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -424,8 +424,8 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; ; VI-LABEL: scalar_xor_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1] @@ -442,7 +442,7 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; SI-LABEL: scalar_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -456,7 +456,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { ; ; VI-LABEL: scalar_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -473,7 +473,7 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) { define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: vector_not_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: 
s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -492,7 +492,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 ; ; VI-LABEL: vector_not_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -514,7 +514,7 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) { ; SI-LABEL: xor_cf: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 @@ -545,7 +545,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 @@ -591,8 +591,8 @@ endif: define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -605,15 +605,15 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; ; VI-LABEL: scalar_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s1, s1, 0xf237b -; VI-NEXT: s_xor_b32 s0, s0, 0x3039 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s3, s3, 0xf237b +; VI-NEXT: s_xor_b32 s2, s2, 0x3039 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 4261135838621753 @@ -624,30 +624,30 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) { ; SI-LABEL: scalar_xor_literal_multi_use_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 -; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13 ; SI-NEXT: s_movk_i32 s8, 0x3039 ; SI-NEXT: s_mov_b32 s9, 0xf237b -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: s_add_u32 s0, s6, 0x3039 -; SI-NEXT: s_addc_u32 s1, s7, 0xf237b +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_add_u32 s0, s2, 0x3039 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_addc_u32 s1, s3, 0xf237b ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
scalar_xor_literal_multi_use_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s2, 0x3039 ; VI-NEXT: s_mov_b32 s3, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -675,8 +675,8 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -688,14 +688,14 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; ; VI-LABEL: scalar_xor_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s0, s0, 63 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b32 s2, s2, 63 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %or = xor i64 %a, 63 @@ -706,8 +706,8 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { ; SI-LABEL: scalar_xor_neg_inline_imm_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -719,14 +719,14 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; ; VI-LABEL: scalar_xor_neg_inline_imm_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x4c -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %or = xor i64 %a, -8 @@ -737,7 +737,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_i64_neg_inline_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -756,7 +756,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ; ; VI-LABEL: vector_xor_i64_neg_inline_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -777,7 +777,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, define amdgpu_kernel void 
@vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: vector_xor_literal_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s10, s6 @@ -796,7 +796,7 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; ; VI-LABEL: vector_xor_literal_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll index 28da8ac423107c..f9a7e887ada239 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -4,7 +4,7 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -25,9 +25,9 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -47,8 +47,8 @@ define amdgpu_kernel void 
@zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -68,8 +68,8 @@ define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 define amdgpu_kernel void @zext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) { ; GCN-LABEL: zext_i16_to_i64_divergent: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[2:3], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll index 3b4ebef1529676..2588d88b002b8b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -525,8 +525,6 @@ } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2 - -attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } !0 = !{} diff --git 
a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll index 138106632c1bc8..9939366e855c41 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -56,4 +56,4 @@ define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 3f6f0c909e8bbf..8922a233b1d8fb 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -43,7 +43,7 @@ ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: body: -define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { +define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -66,5 +66,5 @@ bb4: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll 
b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index b3ed7376a1ede6..8326d95e0e7f21 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -27,16 +27,10 @@ ; CHECK-NEXT: returnsVoid: true ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } -; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } -; CHECK-NEXT: dispatchID: { reg: '$sgpr8_sgpr9' } -; CHECK-NEXT: workGroupIDX: { reg: '$sgpr10' } -; CHECK-NEXT: workGroupIDY: { reg: '$sgpr11' } -; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr12' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr13' } +; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' } +; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' } -; CHECK-NEXT: workItemIDY: { reg: '$vgpr1' } -; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' } ; CHECK-NEXT: psInputAddr: 0 ; CHECK-NEXT: psInputEnable: 0 ; CHECK-NEXT: mode: diff --git a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-instruction-bundle.ll b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-instruction-bundle.ll index ec752c20ef1487..6932216214a634 100644 --- a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-instruction-bundle.ll +++ b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-instruction-bundle.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %arg_in_0, ptr addrspace ; CHECK-ORIG-OPS-NEXT: ; %bb.0: ; CHECK-ORIG-OPS-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ; ; CHECK-ORIG-OPS-NEXT: .cfi_undefined 16 -; CHECK-ORIG-OPS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-ORIG-OPS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-ORIG-OPS-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-ORIG-OPS-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-ORIG-OPS-NEXT: v_mov_b32_e32 v1, 0 @@ -39,7 +39,7 @@ define 
amdgpu_kernel void @foo(ptr addrspace(1) noalias %arg_in_0, ptr addrspace ; CHECK-USER-OPS-NEXT: ; %bb.0: ; CHECK-USER-OPS-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ; ; CHECK-USER-OPS-NEXT: .cfi_undefined 16 -; CHECK-USER-OPS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; CHECK-USER-OPS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-USER-OPS-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-USER-OPS-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-USER-OPS-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index d9c3c4b17090bd..f8d97c81698cd7 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -8,11 +8,11 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) # define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s4 ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 @@ -34,12 +34,12 @@ entry: define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -58,12 +58,12 @@ entry: define protected amdgpu_kernel void @InferFmax(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFmax: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -82,12 +82,12 @@ entry: define protected amdgpu_kernel void @InferFmin(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFmin: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -106,13 +106,13 @@ entry: define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x3c +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_add_u32 s0, s4, s0 @@ -140,11 +140,11 @@ bb1: define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferPHI: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x24 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: s_addc_u32 s1, s5, s1 ; CHECK-NEXT: s_add_u32 s2, s0, -8 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected index 38b8ba12f06626..06a8a6fa04828b 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -5,8 +5,8 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 25 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 -; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 +; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1 ; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 
TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11> ; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 @@ -28,7 +28,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 15 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0> ; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7 @@ -47,7 +47,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535> @@ -68,7 +68,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 18 nodes: ; CHECK-NEXT: t0: ch,glue = EntryToken -; CHECK-NEXT: 
t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %7 +; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0 ; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0 ; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0> ; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>